# Named Entity Recognition

In [3]:
import spacy #type:ignore
from nltk import sent_tokenize

In [8]:
!python -m spacy download en_core_web_trf

^C


Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.3/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.3/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.3/457.4 MB ? eta -:--:--
     -------------------------------------- 0.5/457.4 MB 409.0 kB/s eta 0:18:38
     -----

In [12]:
! python -m spacy info


[1m

spaCy version    3.8.2                         
Location         c:\Users\yashr\anaconda3\envs\sl\Lib\site-packages\spacy
Platform         Windows-10-10.0.26100-SP0     
Python version   3.11.10                       
Pipelines        en_core_web_trf (3.8.0)       



# Load model

In [13]:
def load_model():
    """Load the spaCy ``en_core_web_trf`` pipeline and return it."""
    return spacy.load('en_core_web_trf')

In [15]:
import spacy

# Only download when the model package is not installed yet: the wheel is
# roughly 457 MB, so an unconditional download makes every re-run of the
# notebook slow and dependent on network access.
if not spacy.util.is_package('en_core_web_trf'):
    spacy.cli.download('en_core_web_trf')
nlp = spacy.load('en_core_web_trf')

In [None]:
nlp_model = load_model()

# Load Dataset

In [None]:
import os
import sys
import pathlib

# Path() is the current working directory (the notebook's folder when run
# from Jupyter); the project helpers live one level above it.
folder_path = pathlib.Path().resolve()
project_root = os.path.abspath(os.path.join(folder_path, '../'))

# Append only once so re-running this cell does not pile up duplicate
# entries in sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
from utils import load_subtitles_dataset

In [None]:
dataset_path = "../data/Subtitles/"
df = load_subtitles_dataset(dataset_path)

In [None]:
df.head()

In [None]:
sample_script = df.iloc[0]['script']
sample_script

In [None]:
sentences = sent_tokenize(sample_script)

In [None]:
sentences = sentences[60:90]

In [None]:
# Join with a space: the tokenized sentences still carry their own closing
# punctuation, so a "." separator would produce "end..Next" with no space
# between sentences and blur the token boundaries the NER model sees.
sentence = " ".join(sentences)

In [None]:

sentence

# Run Model

In [None]:
doc = nlp_model(sentence)

In [None]:

doc.ents

In [None]:
for entity in doc.ents:
    print(entity, entity.label_)

In [None]:
def get_ners_inference(script):
    """Extract the first names of PERSON entities for each sentence of a script.

    The script is split into sentences with ``sent_tokenize`` and each
    sentence is processed by the module-level spaCy pipeline ``nlp_model``.

    Args:
        script: Full text of one script as a single string.

    Returns:
        A list with one ``set`` of first-name strings per sentence, in
        sentence order. A sentence without PERSON entities contributes an
        empty set.
    """
    script_sentences = sent_tokenize(script)

    ner_output = []

    # Language.pipe streams the sentences through the pipeline in batches and
    # yields the docs in input order, which avoids one full pipeline call per
    # sentence.
    for doc in nlp_model.pipe(script_sentences):
        ners = set()
        for entity in doc.ents:
            if entity.label_ != "PERSON":
                continue
            # split() with no argument breaks on any whitespace (spaces,
            # newlines, tabs) and never yields empty strings, so the first
            # token is a clean first name when the entity has any text.
            name_parts = entity.text.split()
            if name_parts:
                ners.add(name_parts[0])
        ner_output.append(ners)

    return ner_output

In [None]:

# Take an explicit copy: a new column is assigned to this frame later, and
# assigning into a slice of the original frame triggers pandas'
# SettingWithCopyWarning.
df = df.head(10).copy()

In [None]:
df

In [None]:
df['ners'] = df['script'].apply(get_ners_inference)

In [None]:
df

# Character Network

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [None]:
def generate_character_network(df, window_size=10):
    """Count how often pairs of entities are mentioned close to each other.

    For every row, the sentences in ``df['ners']`` are walked in order while
    keeping a sliding window of the most recent ``window_size`` sentences
    (the current one included). Each entity of the current sentence is paired
    with every *different* entity found in that window, and the pairs are
    tallied over all rows.

    Args:
        df: DataFrame with a ``'ners'`` column. Each cell is an iterable of
            per-sentence collections of entity names (for example the list
            of sets returned by ``get_ners_inference``).
        window_size: Number of most recent sentences, including the current
            one, that form the co-occurrence window. Must be at least 1.

    Returns:
        A DataFrame with columns ``source``, ``target`` and ``value``, sorted
        by ``value`` in descending order. ``source`` and ``target`` are the
        two names of a pair in sorted order and ``value`` is the number of
        times the pair was recorded. The frame is empty (same columns) when
        no pair was found.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    # A window of 0 would turn the slice below into [-0:], i.e. the whole
    # history, which is the opposite of what the caller asked for.
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    entity_relationship = []

    for row in df['ners']:
        previous_entities_in_window = []

        for sentence in row:
            previous_entities_in_window.append(list(sentence))
            previous_entities_in_window = previous_entities_in_window[-window_size:]

            # Flatten the window (list of per-sentence lists) in linear time.
            previous_entities_flattened = [
                name
                for names in previous_entities_in_window
                for name in names
            ]

            for entity in sentence:
                for entity_in_window in previous_entities_flattened:
                    if entity != entity_in_window:
                        # Sort so (A, B) and (B, A) count as the same pair.
                        entity_relationship.append(sorted([entity, entity_in_window]))

    if not entity_relationship:
        return pd.DataFrame(columns=['source', 'target', 'value'])

    relationship_df = pd.DataFrame(entity_relationship, columns=['source', 'target'])
    relationship_df = (
        relationship_df
        .groupby(['source', 'target'])
        .size()
        .reset_index(name='value')
        .sort_values('value', ascending=False)
    )

    return relationship_df

In [None]:
relationship_df = generate_character_network(df)

In [None]:
relationship_df

In [None]:
# Keep only the 200 strongest relationships, highest 'value' first.
relationship_df = relationship_df.sort_values('value', ascending=False).head(200)