In [None]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx

import matplotlib.pyplot as plt

In [None]:
# Load spaCy's small Italian pipeline ('it_core_news_sm'); it is used below to find named entities in the script
NER = spacy.load('it_core_news_sm')

In [None]:
script = 'data/FiloAct3.txt'
# Read the script inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(script, encoding='utf-8') as script_file:
    script_text = script_file.read()
# Run the spaCy pipeline over the whole text; later cells read sentences and entities from this Doc.
script_doc = NER(script_text)

In [None]:
# Quick visual sanity check: highlight the entities found in the first part of the
# document (script_doc[0:2000] slices the Doc, it does not slice the raw string)
displacy.render(script_doc[0:2000], style="ent", jupyter=True)

In [None]:
# Read the character list; the following cells expect a 'character' column holding each character's name
character_df = pd.read_csv("data/characters.csv")

In [None]:
import re

# Matches a parenthesised annotation inside a name. Written as a raw string so the
# backslashes reach the regex engine without triggering invalid-escape warnings.
parenthetical_pattern = re.compile(r"\(.*?\)")

# Remove the annotations AND the whitespace they leave behind: a name that keeps a
# trailing space can never be equal to the entity text it is compared against later.
character_df['character'] = character_df['character'].apply(
    lambda name: parenthetical_pattern.sub("", name).strip()
)
# First space-delimited token of the cleaned name, used to match first-name-only mentions.
character_df['character_firstname'] = character_df['character'].apply(
    lambda name: name.split(' ', 1)[0]
)

In [None]:
# One record per sentence: the sentence itself plus the text of every named entity found in it
sentence_records = [
    {"sentence": sentence, "entities": [entity.text for entity in sentence.ents]}
    for sentence in script_doc.sents
]

sent_entity_df = pd.DataFrame(sentence_records)

In [None]:
# Inspect the sentence / entity table
sent_entity_df


In [None]:
# Function to filter out non-character entities
def filter_entity(ent_list, character_df):
    """Keep only the entities that name a known character.

    Parameters
    ----------
    ent_list : iterable of str
        Entity texts to check.
    character_df : pandas.DataFrame
        Must provide the columns ``character`` and ``character_firstname``.

    Returns
    -------
    list of str
        The entries of ``ent_list`` that equal a value in either column,
        in their original order (duplicates are kept).
    """
    # Build the lookup once; the columns do not change while the entities are scanned,
    # so there is no need to rebuild two lists for every single entity.
    known_names = set(character_df.character) | set(character_df.character_firstname)
    return [ent for ent in ent_list if ent in known_names]

In [None]:
# Smoke test: only names present in character_df should come back.
# NOTE(review): "Filomena" is spelled differently from the "Filumena" used in the output
# file names of this notebook - confirm which spelling characters.csv contains.
filter_entity(["Filomena", "Thu", "2"], character_df)

In [None]:
# For every sentence keep only the entities that match a known character
sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(lambda x: filter_entity(x, character_df))

# Filter out sentences that don't have any character entities.
# .copy() makes the result an independent frame, so assigning a column to it later
# neither raises SettingWithCopyWarning nor risks writing to a temporary object.
sent_entity_df_filtered = sent_entity_df[sent_entity_df['character_entities'].map(len) > 0].copy()
sent_entity_df_filtered.head(10)

In [None]:
# Take only first name of characters.
# .assign() returns a new frame instead of writing into what may be a slice of
# sent_entity_df, which avoids pandas' SettingWithCopyWarning.
sent_entity_df_filtered = sent_entity_df_filtered.assign(
    character_entities=sent_entity_df_filtered['character_entities'].apply(
        lambda names: [name.split()[0] for name in names]
    )
)


In [None]:
# Reset every pandas option whose name matches '^display.' to its default, then show the filtered table
pd.reset_option('^display.', silent=True)
sent_entity_df_filtered

In [None]:
# Number of index positions a window extends past its starting sentence
window_size = 5
relationships = []

# Guard against an empty table: index[-1] would raise IndexError.
if len(sent_entity_df_filtered) > 0:
    last_index = sent_entity_df_filtered.index[-1]

    for i in range(last_index):
        # .loc slices by index label and includes both endpoints, so the window holds
        # the rows whose original sentence index lies in [i, i + window_size].
        end_i = min(i + window_size, last_index)
        window_entities = sent_entity_df_filtered.loc[i: end_i].character_entities
        # Flatten the per-sentence lists in a single pass
        # (sum(lists, []) copies the growing accumulator at every step).
        char_list = [char for entities in window_entities for char in entities]

        # Remove duplicated characters that are next to each other
        char_unique = [char for pos, char in enumerate(char_list)
                       if pos == 0 or char != char_list[pos - 1]]

        # Link each character to the next distinct character mentioned in the window;
        # zip yields nothing when fewer than two characters are present.
        for a, b in zip(char_unique, char_unique[1:]):
            relationships.append({"source": a, "target": b})

In [None]:
# Name the columns explicitly so the frame still has "source" and "target" when no
# relationship was found; an empty list alone would produce a frame with no columns.
relationship_df = pd.DataFrame(relationships, columns=["source", "target"])

In [None]:
# Lift the row limit so the whole edge list is displayed.
# Note: this changes the global pandas option, so it also affects every later display.
pd.set_option('display.max_rows', None)
relationship_df

In [None]:
# Sort the two names within each row so that a->b and b->a become the same pair
sorted_pairs = np.sort(relationship_df.values, axis=1)
relationship_df = pd.DataFrame(sorted_pairs, columns=relationship_df.columns)
relationship_df

In [None]:
# Give every row a count of 1, then add the counts up per (source, target) pair
relationship_df = (
    relationship_df
    .assign(value=1)
    .groupby(["source", "target"], sort=False, as_index=False)
    .sum()
)

In [None]:
# Inspect the aggregated edge list
relationship_df

In [None]:
# Write the edge list to output.csv (without the index column)
relationship_df.to_csv('output.csv', index=False, encoding='utf-8')

In [None]:
# Create an undirected graph from the edge list; each edge keeps the "value"
# column of its row as an edge attribute
G = nx.from_pandas_edgelist(relationship_df, 
                            source = "source", 
                            target = "target", 
                            edge_attr = "value", 
                            create_using = nx.Graph())

In [None]:

plt.rcParams["figure.figsize"] = (5,5)
pos = nx.circular_layout(G)
# edge_cmap is only applied when edge_color is a sequence of numbers; without it the
# colormap is silently ignored. Use the "value" attribute stored on each edge.
edge_weights = [weight for _, _, weight in G.edges(data="value")]
nx.draw_networkx(G, with_labels=True, node_color='skyblue',
                 edge_color=edge_weights, edge_cmap=plt.cm.Blues,
                 edge_vmin=0,  # anchor the colour scale at 0 so the lowest value is not mapped to the palest shade
                 pos=pos)
plt.show()

In [None]:
from pyvis.network import Network
# pyvis canvas configured for inline display in the notebook
net = Network(notebook = True, width="1000px", height="700px", bgcolor='#ffffff', font_color='black')

# Map each node to its degree
node_degree = dict(G.degree)

# Store the degree on every node as its 'size' attribute (this mutates G);
# presumably pyvis reads it as the drawn node size - confirm against the pyvis docs
nx.set_node_attributes(G, node_degree, 'size')

# Load the networkx graph into pyvis and write/show the HTML page
net.from_nx(G)
net.show("Filumena.html")

In [None]:
# Degree centrality: one score per node, returned as a {node: score} dictionary
degree_dict = nx.degree_centrality(G)
degree_dict
# A node's degree is the number of edges attached to it; the node with the highest
# score in this dictionary is the most connected one in the graph


In [None]:
# One row per node, indexed by node name
degree_df = pd.DataFrame.from_dict(degree_dict, orient='index', columns=['centrality'])
# Plot every node, highest centrality first (no top-N cut is applied)
degree_df.sort_values('centrality', ascending=False).plot(kind="bar")

In [None]:
import pandas as pd
import networkx as nx

# Requires the graph G built earlier in the notebook

# Degree centrality score of every node
degree_dict = nx.degree_centrality(G)

# Build the table in one chain: one row per node, highest score first,
# with the node name moved from the index into a 'Node ID' column
degree_df = (
    pd.DataFrame.from_dict(degree_dict, orient='index', columns=['Centrality'])
    .sort_values('Centrality', ascending=False)
    .reset_index()
    .rename(columns={'index': 'Node ID'})
)

# Print the ranked table
print(degree_df)


In [None]:
# Betweenness centrality: one score per node, returned as a {node: score} dictionary
betweenness_dict = nx.betweenness_centrality(G)
betweenness_df = pd.DataFrame.from_dict(betweenness_dict, orient='index', columns=['centrality'])
# Plot every node, highest centrality first (no top-N cut is applied)
betweenness_df.sort_values('centrality', ascending=False).plot(kind="bar")

In [None]:
import pandas as pd
import networkx as nx

# Requires the graph G built earlier in the notebook

# Calculate betweenness centrality. The result is stored under its own name and used
# directly below, so this cell neither depends on a dictionary computed in another cell
# nor overwrites the degree-centrality variables.
betweenness_dict = nx.betweenness_centrality(G)

# One row per node, sorted by betweenness centrality in descending order,
# with the node name moved from the index into a 'Node ID' column
betweenness_rank_df = (
    pd.DataFrame.from_dict(betweenness_dict, orient='index', columns=['Centrality'])
    .sort_values('Centrality', ascending=False)
    .reset_index()
    .rename(columns={'index': 'Node ID'})
)

# Display the DataFrame
print(betweenness_rank_df)


In [None]:
# Closeness centrality: one score per node, returned as a {node: score} dictionary
closeness_dict = nx.closeness_centrality(G)
closeness_df = pd.DataFrame.from_dict(closeness_dict, orient='index', columns=['centrality'])
# Plot every node, highest centrality first (no top-N cut is applied)
closeness_df.sort_values('centrality', ascending=False).plot(kind="bar")

In [None]:
import pandas as pd
import networkx as nx

# Requires the graph G built earlier in the notebook

# Calculate closeness centrality. The result is stored under its own name and used
# directly below, so this cell neither depends on a dictionary computed in another cell
# nor overwrites the degree-centrality variables.
closeness_dict = nx.closeness_centrality(G)

# One row per node, sorted by closeness centrality in descending order,
# with the node name moved from the index into a 'Node ID' column
closeness_rank_df = (
    pd.DataFrame.from_dict(closeness_dict, orient='index', columns=['Centrality'])
    .sort_values('Centrality', ascending=False)
    .reset_index()
    .rename(columns={'index': 'Node ID'})
)

# Display the DataFrame
print(closeness_rank_df)


In [None]:
import community as community_louvain

In [None]:
# Louvain community detection.
# weight="value": the edges of G carry their counts under the "value" attribute; the
#   default key "weight" does not exist on them, so every edge would count as 1.
# random_state: the algorithm is randomised; a fixed seed makes re-runs reproducible.
communities = community_louvain.best_partition(G, weight="value", random_state=42)

In [None]:
# Inspect the partition returned by best_partition (node -> community id)
communities

In [None]:
# Store each node's community id as its 'group' attribute (this mutates G);
# presumably pyvis colours nodes by 'group' - confirm against the pyvis docs
nx.set_node_attributes(G, communities, 'group')

In [None]:
# Second pyvis canvas (dark background) for the graph with the community attribute attached;
# relies on Network having been imported in an earlier cell
com_net = Network(notebook = True, width="1000px", height="700px", bgcolor='#222222', font_color='white')
# Load the networkx graph into pyvis and write/show the HTML page
com_net.from_nx(G)
com_net.show("Filumena_communities.html")