In [31]:
import os 
import sys 
import pathlib  
import spacy
import pandas as pd 
import matplotlib.pyplot as plt
import networkx as nx  
from pyvis.network import Network
from nltk import sent_tokenize
folder_path = pathlib.Path().parent.resolve()
sys.path.append(os.path.join(folder_path, '../'))
from Utils import load_subs_datasets

In [32]:
def load_model():
    return spacy.load('en_core_web_trf')

nlp_model = load_model()

dataset_path = "../data/Subtitles/"
df = load_subs_datasets(dataset_path)

In [6]:
sample_script = df.iloc[0]['script']
sentence = sent_tokenize(sample_script)
sentence = ".".join(sentence)
doc = nlp_model(sentence)

  with torch.cuda.amp.autocast(self._mixed_precision):


In [7]:
for entity in doc.ents:
    print(entity.text, entity.label_)

nine CARDINAL
Ninjas NORP
Fourth ORDINAL
Ninja NORP
Ninja PERSON
Fourth ORDINAL
Naruto PERSON
lruka Sensei?.The PERSON
Naruto PERSON
the Transformation Jutsu!.Even PRODUCT
Haruno PERSON
Sasuke Uchiha PERSON
Naruto Uzumaki PERSON
Naruto PERSON
Naruto PERSON
tonight TIME
Naruto Uzumaki!.Naruto PERSON
Ninja NORP
Fourth ORDINAL
nine CARDINAL
Leaf PERSON
Ninja Academy ORG
Tomorrow DATE
One more bowl QUANTITY
three or more CARDINAL
Naruto PERSON
Ninja NORP
Ninja NORP
Iruka PERSON
Iruka Sensei PERSON
Naruto PERSON
Iruka Sensei!.What PERSON
Hokage PERSON
Naruto PERSON
the Scroll of Sealing WORK_OF_ART
The Scroll of Sealing?!.Let WORK_OF_ART
first ORDINAL
First ORDINAL
Naruto PERSON
Naruto PERSON
the Scroll of Sealing WORK_OF_ART
one CARDINAL
one CARDINAL
Jutsu WORK_OF_ART
Mizuki PERSON
Naruto PERSON
Mizuki PERSON
Iruka PERSON
12 years ago DATE
Naruto PERSON
Iruka PERSON
Iruka PERSON
Naruto PERSON
Naruto PERSON
the Scroll of Sealing WORK_OF_ART
Naruto PERSON
Naruto PERSON
Mizuki PERSON
The Scro

In [8]:
def get_ners_inference(script):
    script_sentences = sent_tokenize(script)
    ner_output = []
    
    for sentence in script_sentences:
        doc = nlp_model(sentence)
        ners = set()
        for entity in doc.ents:
            if entity.label_ == 'PERSON':
                full_name = entity.text
                first_name = entity.text.split()[0]
                first_name = first_name.strip()
                ners.add(first_name)
                
        ner_output.append(ners)
    return ner_output

In [9]:
df = df.head(10)
df['ners'] = df['script'].apply(get_ners_inference)

In [24]:
def generate_character_network(df):
    windows = 10 
    entity_relationship = []
    
    for row in df['ners']:
        previous_entities = []
        
        for sentence in row:
            previous_entities.append(list(sentence))
            previous_entities = previous_entities[-windows:]
            
            previous_entities_flattened = sum(previous_entities, [])
            
            for entity in sentence:
                for entity_in_window in previous_entities_flattened:
                    if entity != entity_in_window:
                        entity_relationship.append(sorted([entity, entity_in_window]))
                        
                        
    relationship_df = pd.DataFrame({'value': entity_relationship})
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value', ascending=False)
    return relationship_df        
            

In [None]:
relation_df = generate_character_network(df)
relationship_df = relation_df.sort_values('value', ascending=False)
relation_df = relation_df.head(200)

In [30]:
G = nx.from_pandas_edgelist(relation_df, 'source', 'target', edge_attr ='value', create_using=nx.Graph())


net = Network(notebook=True, width="1000px", height="700px", bgcolor="#222222", font_color="white", cdn_resources="remote") 
node_degrees = dict(G.degree)
nx.set_node_attributes(G, node_degrees, 'size')
net.from_nx(G)
net.show("character_network.html")

character_network.html
