
# Character Network


In [1]:
#imports
import glob2
import pandas as pd
import spacy
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import ssl
nltk.download('punkt')
import json
!pip install pyvis

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jedi>=0.16 (from ipython>=5.3.0->pyvis)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: jedi, pyvis
Successfully installed jedi-0.19.1 pyvis-0.3.2


In [3]:
# Locate the subtitle files (sorted by path) and load the character names
subtitles_paths = sorted(glob2.glob("/content/drive/MyDrive/Colab Notebooks/2024 Data Science Projects/Naruto/Subtitles/*.ass"))
file_path = '/content/drive/MyDrive/Colab Notebooks/2024 Data Science Projects/Naruto/characters.json'

# Open with an explicit encoding, the same one used for the subtitle files, so
# non-ASCII names decode identically whatever the platform's default locale is.
with open(file_path, 'r', encoding='utf-8') as characters_file:
    # The JSON document is a list of objects; only their 'character' field is kept.
    character_names = [item['character'] for item in json.load(characters_file)]

In [None]:
# Reading the file data (can't use pandas as commas in required text cause issue with parsing)
HEADER_LINE_COUNT = 15  # lines skipped at the top of every subtitle file
TEXT_COLUMN = 9         # zero-based index of the comma-separated column that holds the spoken text

scripts = []
episode_num = []
for path in subtitles_paths:
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()[HEADER_LINE_COUNT:]

    rows = []
    for line in lines:
        # Split at most TEXT_COLUMN times: everything after the 9th comma is the
        # text, so commas inside the dialogue no longer cut the line short.
        fields = line.split(',', TEXT_COLUMN)
        if len(fields) <= TEXT_COLUMN:
            # Not enough columns to contain a text field; ignore the line.
            continue
        text = fields[TEXT_COLUMN].strip()
        # Replace the new-line notation and the {\i1}/{\i0} tags with a space.
        # Raw strings keep the backslashes literal without relying on Python
        # tolerating the invalid "\i" escape sequence.
        for tag in (r"\N", r"{\i1}", r"{\i0}"):
            text = text.replace(tag, ' ')
        rows.append(text)

    # Join all lines of the episode into a single script
    script = " ".join(rows)

    # Extract the episode number from the filename: the first whitespace-delimited
    # token after the first '-' must be an integer.
    filename = path.split('/')[-1]
    episode = int(filename.split('-')[1].split()[0].strip())
    scripts.append(script)
    episode_num.append(episode)

# Create a DataFrame from the lists
df = pd.DataFrame({'episode': episode_num, 'script': scripts})

In [None]:
# Clean the character names by removing brackets and last names
cleaned_character_names = set()
for name in character_names:
    # Swap every "(...)" group for a space (not an empty string) so the words on
    # either side of a mid-name parenthetical are not fused together.
    name_parts = re.sub(r'\s*\(.*?\)\s*', ' ', name).split()
    # Nothing left once the parentheticals are gone: skip the entry instead of
    # raising IndexError on name_parts[0].
    if not name_parts:
        continue
    first_name = name_parts[0]
    # Single-character names are dropped.
    if len(first_name) > 1:
        cleaned_character_names.add(first_name)

# Sorted so the list has the same order on every run (plain set order does not).
cleaned_character_names = sorted(cleaned_character_names)

# Rebuild df from the parsed lists so it holds only 'episode' and 'script'
df = pd.DataFrame({'episode': episode_num, 'script': scripts})
# Function to find character names in the script
def get_names(script):
    """Return the known character names mentioned in each sentence of ``script``.

    The script is split into sentences with ``sent_tokenize`` and every sentence
    into tokens with ``word_tokenize``. A name from ``cleaned_character_names``
    counts as mentioned when it is equal to one of the sentence's tokens.

    Returns:
        A list with one entry per sentence. Each entry is the list of matched
        names, in the order they appear in ``cleaned_character_names``; it is
        empty when the sentence mentions nobody.
    """
    return [
        [name for name in cleaned_character_names if name in tokens]
        for tokens in map(word_tokenize, sent_tokenize(script))
    ]

# Try the function on the first episode only: a one-row frame holding that
# episode's script next to the per-sentence names found in it
first_episode_script = df.loc[0, 'script']
new_df = pd.DataFrame({
    'script': [first_episode_script],
    'names': [get_names(first_episode_script)],
})
new_df

In [None]:
# Applying function to df
# assign() returns a new frame, so df keeps only its original columns; a plain
# `df1 = df` would merely alias df and add the 'names' column to it as well.
df1 = df.assign(names=df['script'].apply(get_names))
df1.head()

In [None]:
# Collect a pair every time two different characters are named within the same
# sliding window of sentences (the current sentence plus the ones just before it,
# at most `window` sentences in total)
window = 10
entity_relationship = []
for episode_names in df1['names']:
    recent_sentences = []

    for current_names in episode_names:
        # Slide the window: add the current sentence, keep only the newest `window`
        recent_sentences = (recent_sentences + [current_names])[-window:]
        # One flat list of every name mentioned inside the window
        nearby_names = [name for names in recent_sentences for name in names]
        # Pair each name of the current sentence with every different name in the
        # window; sorting makes (A, B) and (B, A) the same relationship
        entity_relationship.extend(
            sorted([name, other])
            for name in current_names
            for other in nearby_names
            if name != other
        )

# One row per collected pair, stored as a two-element list in 'value'
relationship_df = pd.DataFrame({'value': entity_relationship})

In [None]:
# Count how often each (source, target) pair was collected, most frequent first.
# The table is rebuilt from entity_relationship instead of being derived from the
# previous relationship_df, so re-running this cell gives the same result rather
# than failing on an already aggregated frame.
relationship_df = (
    # every collected pair is a two-element list: first -> source, second -> target
    pd.DataFrame(entity_relationship, columns=['source', 'target'])
    .groupby(['source', 'target'])
    .size()
    .reset_index(name='value')  # 'value' now holds the number of occurrences
    .sort_values('value', ascending=False)
)

In [None]:
# Preview the first rows of the relationship table
relationship_df.head()

In [None]:
# Keep only the first 200 rows of the relationship table
relationship_df = relationship_df.iloc[:200]

In [None]:
# Build an undirected graph with one edge per row of relationship_df; each edge
# carries the row's 'value' as an edge attribute
import networkx as nx
G = nx.from_pandas_edgelist(
    relationship_df,
    "source",
    "target",
    edge_attr="value",
    create_using=nx.Graph(),
)

In [None]:
#showing the network map
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
pos = nx.kamada_kawai_layout(G)

# edge_cmap is only applied when edge_color is a sequence of numbers, so colour
# every edge by its 'value' attribute; without that the colormap has no effect.
edge_weights = [value for _, _, value in G.edges(data='value')]
edge_style = {}
if edge_weights:
    strongest = max(edge_weights)
    edge_style = {
        'edge_color': edge_weights,
        'edge_cmap': plt.cm.Blues,
        # Start the colour scale below zero so the weakest edges are not mapped
        # to the palest end of the colormap on the white background.
        'edge_vmin': -0.5 * strongest,
        'edge_vmax': strongest,
    }

nx.draw(G, pos=pos, with_labels=True, node_color='skyblue', **edge_style)
plt.show()

In [None]:
# Import the necessary libraries
from pyvis.network import Network
import networkx as nx
# display/HTML are imported from the public IPython.display module; importing
# display from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Create a Pyvis Network object with cdn_resources set to 'in_line'
net = Network(notebook=True, width="1000px", height="700px", bgcolor='#222222', font_color='white', cdn_resources='in_line')

# Calculate node degrees
node_degree = dict(G.degree)

# Store each node's degree on G as its 'size' attribute
nx.set_node_attributes(G, node_degree, 'size')

# Add nodes and edges from the NetworkX graph to the Pyvis Network
net.from_nx(G)

# Generate and save the network visualization to an HTML file
net.show("naruto.html")

# Display the generated HTML file inline; filename= states explicitly that the
# argument is a path whose contents should be rendered
display(HTML(filename='naruto.html'))

#