## Import data & add new metadata 

In [207]:
from pathlib import Path
home = Path.home()
file_dir = home / "balzac_project"/ "balzac_project" / "data"


In [208]:
import pandas as pd
import os
from collections import Counter

os.chdir(file_dir)

nodes_df = pd.read_csv('nodes_vs_export.csv')
nodes = [{'data': row.to_dict()} for _, row in nodes_df.iterrows()]
net_df = pd.read_csv('character_frequencies.csv')
metadata_df = pd.read_csv('character_metadata.csv')



In [209]:
# merge new metadata into nodes_df

metadata_df = metadata_df.rename(columns={"Character": "id"})
nodes_df = nodes_df.reset_index()
metadata_df = metadata_df.reset_index()
nodes_df = nodes_df.merge(metadata_df, on="id", how="left")

In [210]:
nodes_df.drop(columns=['Name Variants', 'index_y', 'Major Appearances', 'Minor Appearances', 'index_x'], inplace=True)

In [211]:
nodes_df['Social Class'] = nodes_df['Social Class'].str.strip()
nodes_df['Gender'] = nodes_df['Gender'].str.strip()

In [212]:
nodes_df

Unnamed: 0,id,label,type,color,shape,size,scene,Gender,Social Class,Character Description
0,Andoche Finot,Andoche Finot,character,skyblue,dot,25,,male,Bourgeoisie,Newspaper editor and businessman\nSon of a hat...
1,A PRINCE OF BOHEMIA,A PRINCE OF BOHEMIA,novel,#FF6F61,square,40,SCENES DE LA VIE PARISIENNE,,,
2,THE DEPUTY OF ARCIS,THE DEPUTY OF ARCIS,novel,goldenrod,square,50,SCENES DE LA VIE POLITIQUE,,,
3,A DAUGHTER OF EVE,A DAUGHTER OF EVE,novel,blue,square,50,SCENES DE LA VIE PRIVEE,,,
4,URSULA,URSULA,novel,sandybrown,square,40,SCENES DE LA VIE PROVINCE,,,
...,...,...,...,...,...,...,...,...,...,...
105,Madame Roguin,Madame Roguin,character,skyblue,dot,10,,female,Bourgeoisie,Wealthy provinical bourgeouisie \nWife of Rogu...
106,VENDETTA,VENDETTA,novel,blue,square,10,SCENES DE LA VIE PRIVEE,,,
107,AT THE SIGN OF THE CAT AND RACKET,AT THE SIGN OF THE CAT AND RACKET,novel,blue,square,10,SCENES DE LA VIE PRIVEE,,,
108,Vicomtesse de Beauséant,Vicomtesse de Beauséant,character,skyblue,dot,20,,female,Old Aristocracy,"Cousin of Rastignac\nEventually Comtesse, then..."


## Update graph HTML with beautiful soup 

In [330]:
nodes_df

Unnamed: 0_level_0,label,type,color,shape,size,scene,gender,social_class,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Andoche Finot,Andoche Finot,character,skyblue,dot,25,,male,Bourgeoisie,Newspaper editor and businessman\nSon of a hat...
A PRINCE OF BOHEMIA,A PRINCE OF BOHEMIA,novel,#FF6F61,square,40,SCENES DE LA VIE PARISIENNE,,,
THE DEPUTY OF ARCIS,THE DEPUTY OF ARCIS,novel,goldenrod,square,50,SCENES DE LA VIE POLITIQUE,,,
A DAUGHTER OF EVE,A DAUGHTER OF EVE,novel,blue,square,50,SCENES DE LA VIE PRIVEE,,,
URSULA,URSULA,novel,sandybrown,square,40,SCENES DE LA VIE PROVINCE,,,
...,...,...,...,...,...,...,...,...,...
Madame Roguin,Madame Roguin,character,skyblue,dot,10,,female,Bourgeoisie,Wealthy provinical bourgeouisie \nWife of Rogu...
VENDETTA,VENDETTA,novel,blue,square,10,SCENES DE LA VIE PRIVEE,,,
AT THE SIGN OF THE CAT AND RACKET,AT THE SIGN OF THE CAT AND RACKET,novel,blue,square,10,SCENES DE LA VIE PRIVEE,,,
Vicomtesse de Beauséant,Vicomtesse de Beauséant,character,skyblue,dot,20,,female,Old Aristocracy,"Cousin of Rastignac\nEventually Comtesse, then..."


In [218]:
graph_dir = home / "balzac_project" / "balzac_project" 
os.chdir(graph_dir)

In [331]:
nodes_df = pd.read_csv("updated_nodes.csv", quotechar='"', engine='python')

In [332]:
nodes_df.set_index("id", inplace=True)

In [333]:
from bs4 import BeautifulSoup
# open HTML file
with open("balzac_character_network_with_filters.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

script_tag = None
for s in soup.find_all("script"):
    if "nodes = new vis.DataSet" in s.text:
        script_tag = s
        break

script_text = script_tag.string

In [337]:
import regex as re
import json

# search for nodes
pattern = r"new vis\.DataSet\(\s*(\[\{.*?\}\])\s*\)"
match = re.search(pattern, script_text, re.DOTALL)
if match:
    nodes_json_str = match.group(1)
    nodes_json_str = nodes_json_str.replace("NaN", "null").replace("undefined", "null") # replace NaN with null
    nodes_list = json.loads(nodes_json_str)
else:
    raise ValueError("Could not find vis.DataSet array")


# add new metadata
for node in nodes_list:
    node_id = node['id']
    if node_id in nodes_df.index:
        gender = nodes_df.at[node_id, 'gender']
        social_class = nodes_df.at[node_id, 'social_class']
        title = nodes_df.at[node_id, 'title']
        node['title'] = None if pd.isna(title) else title
        node['gender'] = None if pd.isna(gender) else gender
        node['social_class'] = None if pd.isna(social_class) else social_class
    
    else:
        node['gender'] = None
        node['social_class'] = None
        node['title'] = None

    for k, v in node.items(): 
        if pd.isna(v):
            node[k] = None

In [346]:
def fix_line_breaks(s):
    if not s or pd.isna(s):
        return None
    s = str(s)  # ensure it's a string
    s = s.replace("\\", "\\\\")  # escape backslashes first
    s = s.replace('"', '\\"')    # escape quotes
    s = s.replace("\n", "\\n")   # escape newlines
    return s

for node in nodes_list:
    node['title'] = fix_line_breaks(node.get('title'))

        




In [347]:
nodes_json_str = json.dumps(nodes_list, ensure_ascii=False)

# replace the original nodes
updated_script_text = re.sub(
    pattern,
    f"nodes = new vis.DataSet({nodes_json_str});",
    script_text,
    flags=re.DOTALL
)

# update the script tag 
script_tag.string.replace_with(updated_script_text)

# write new file
with open("balzac_character_network_test.html", "w", encoding="utf-8") as f:
    f.write(str(soup))

In [348]:
nodes_list

[{'color': '#5a7684',
  'id': 'Andoche Finot',
  'label': 'Andoche Finot',
  'scene': None,
  'shape': 'dot',
  'size': 30,
  'type': 'character',
  'title': 'Newspaper editor and businessman\\\\\\\\nSon of a hatter\\\\\\\\nEventually ascends to newspaper owner and a commerical force\\\\\\\\nUsually features with journalists or financiers (Gaudissart, des Lupeaulx)',
  'gender': 'male',
  'social_class': 'Bourgeoisie'},
 {'color': ' #1a1a40',
  'id': 'A PRINCE OF BOHEMIA',
  'label': 'A PRINCE OF BOHEMIA',
  'scene': 'SCENES DE LA VIE PARISIENNE',
  'shape': 'square',
  'size': 50,
  'type': 'novel',
  'title': None,
  'gender': None,
  'social_class': None},
 {'color': '#631d1d',
  'id': 'THE DEPUTY OF ARCIS',
  'label': 'THE DEPUTY OF ARCIS',
  'scene': 'SCENES DE LA VIE POLITIQUE',
  'shape': 'square',
  'size': 60,
  'type': 'novel',
  'title': None,
  'gender': None,
  'social_class': None},
 {'color': '#9b8579',
  'id': 'A DAUGHTER OF EVE',
  'label': 'A DAUGHTER OF EVE',
  'scen