In [1]:
import os
from bs4 import BeautifulSoup
import csv
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
import pandas as pd

In [2]:
# Mount Google Drive (optional, only meaningful when running inside Google Colab).
# The import is guarded so that a fresh "Restart & Run All" also works on a kernel
# where the `google.colab` package is not installed; IN_COLAB records which
# environment was detected.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("google.colab is not available; skipping Google Drive mount.")
else:
    IN_COLAB = True
    drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Directory that is scanned for input `.xml` files, and directory that
# receives the generated CSV / HTML outputs.
data_dir = '/content/input/'
output_dir = '/content/output'

# exist_ok=True makes this cell idempotent and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(output_dir, exist_ok=True)

In [4]:
def read_xml_and_create_matrix(file_path):
    """Build a speaker-by-speaker matrix of dialogue counts from an XML file.

    Every ``<dialogue>`` element is inspected for a ``<speaker>`` and a
    ``<receiver>`` child. Each dialogue that has both (non-empty) adds 1 to
    the cell ``[speaker][receiver]``.

    Args:
        file_path: Path of the UTF-8 encoded XML file to read.

    Returns:
        A tuple ``(speakers, matrix)``:
        * ``speakers`` - alphabetically sorted list of the distinct speaker
          names found in the file (sorted so that row/column order is the
          same on every run).
        * ``matrix`` - list of rows, one per speaker in ``speakers`` order;
          ``matrix[i][j]`` is the number of dialogues spoken by
          ``speakers[i]`` to ``speakers[j]``. Receivers that never appear as
          a speaker have no column.

    Notes:
        Names are stripped of surrounding whitespace, and empty names are
        ignored, so that formatting differences in the XML do not create
        distinct characters.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'xml')

    # interactions[speaker][receiver] -> number of dialogues
    interactions = {}
    speaker_names = set()

    for dialogue in soup.find_all('dialogue'):
        # Look each tag up once instead of once for the test and once for the value.
        speaker_tag = dialogue.find('speaker')
        receiver_tag = dialogue.find('receiver')
        speaker = speaker_tag.text.strip() if speaker_tag is not None else ''
        receiver = receiver_tag.text.strip() if receiver_tag is not None else ''

        # A character counts as a speaker even when the receiver is missing.
        if speaker:
            speaker_names.add(speaker)

        if not speaker or not receiver:
            continue

        per_receiver = interactions.setdefault(speaker, {})
        per_receiver[receiver] = per_receiver.get(receiver, 0) + 1

    # Sorting gives a deterministic order; iteration order of a set of
    # strings can differ between interpreter runs.
    speakers = sorted(speaker_names)

    matrix = [
        [interactions.get(speaker, {}).get(recv, 0) for recv in speakers]
        for speaker in speakers
    ]

    return speakers, matrix

def save_matrix_to_csv(speakers, matrix, output_file, suffix="_matrix"):
    """Write an interaction matrix to ``<output_file><suffix>.csv``.

    The first row is a header (``'Speaker/Receiver'`` followed by the speaker
    names); every following row starts with a speaker name followed by that
    speaker's matrix row.

    Args:
        speakers: Sequence of speaker names, in matrix row/column order.
        matrix: Sequence of rows, one per entry of ``speakers``.
        output_file: Output path without suffix and extension.
        suffix: Text appended to ``output_file`` before ``.csv``.
    """
    csv_path = f"{output_file}{suffix}.csv"

    # Write UTF-8 explicitly: the XML input is read as UTF-8, and relying on
    # the platform default encoding can fail on non-ASCII character names.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Speaker/Receiver', *speakers])  # Header row
        for name, row in zip(speakers, matrix):
            writer.writerow([name, *row])

def create_network_from_matrix(speakers, matrix):
    """Create an undirected weighted graph from a speaker/receiver matrix.

    Args:
        speakers: List of names; ``speakers[i]`` labels row ``i`` and column
            ``i`` of ``matrix``.
        matrix: Square list of rows where ``matrix[i][j]`` is the amount of
            interaction from ``speakers[i]`` to ``speakers[j]``.

    Returns:
        An ``nx.Graph`` with one edge per pair of distinct names that interact
        in at least one direction. The edge ``weight`` is the sum of both
        directions (``matrix[i][j] + matrix[j][i]``). Names without any
        interaction are not added as nodes.
    """
    G = nx.Graph()

    for i, speaker in enumerate(speakers):
        for j, count in enumerate(matrix[i]):
            receiver = speakers[j]
            # Skip empty cells and the diagonal (no self-loops).
            if count <= 0 or speaker == receiver:
                continue
            if G.has_edge(speaker, receiver):
                # The graph is undirected, so A->B and B->A share one edge.
                # Calling add_edge again would overwrite the weight and drop
                # one direction's count; accumulate instead.
                G[speaker][receiver]['weight'] += count
            else:
                G.add_edge(speaker, receiver, weight=count)

    return G

def visualize_network_interactive(G, output_file, suffix="_network", seed=42):
    """Render the graph as an interactive Plotly figure and save it as HTML.

    Nodes are placed with a spring layout, labelled with their name and
    coloured by their number of neighbours. The figure is written to
    ``<output_file><suffix>.html``.

    Args:
        G: networkx graph to draw.
        output_file: Output path without suffix and extension.
        suffix: Text appended to ``output_file`` before ``.html``.
        seed: Seed passed to ``nx.spring_layout`` so that the layout is the
            same on every run; pass ``None`` for a random layout.
    """
    pos = nx.spring_layout(G, seed=seed)

    # All edges are drawn by one line trace; a None entry after each
    # segment breaks the line between consecutive edges.
    edge_x = []
    edge_y = []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(x=edge_x, y=edge_y,
                            line=dict(width=0.5, color='#888'),
                            hoverinfo='none',
                            mode='lines')

    nodes = list(G.nodes())
    node_x = [pos[node][0] for node in nodes]
    node_y = [pos[node][1] for node in nodes]
    # Number of neighbours of each node, in the same order as `nodes`.
    node_adjacencies = [len(G[node]) for node in nodes]

    node_trace = go.Scatter(x=node_x, y=node_y,
                            mode='markers+text',
                            text=nodes,
                            textposition="top center",
                            hoverinfo='text',
                            marker=dict(showscale=True,
                                        colorscale='HOT',
                                        reversescale=True,
                                        color=node_adjacencies,
                                        size=10,
                                        # The nested title dict replaces the
                                        # deprecated `titleside` property.
                                        colorbar=dict(thickness=15,
                                                      title=dict(text='Number of Interactions',
                                                                 side='right'),
                                                      xanchor='left')))

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(showlegend=False,
                                      hovermode='closest',
                                      margin=dict(b=20, l=5, r=5, t=40),
                                      xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                                      yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

    output_file = f"{output_file}{suffix}.html"

    # Save to HTML
    pio.write_html(fig, file=output_file)

def count_words_in_line(line):
    """Return the number of whitespace-separated tokens in ``line``."""
    tokens = line.split()
    return len(tokens)

def read_xml_and_create_word_count_matrix(file_path):
    """Build a speaker-by-speaker matrix of spoken word counts from an XML file.

    Every ``<dialogue>`` element is inspected for ``<speaker>``,
    ``<receiver>`` and ``<line>`` children. For each dialogue that has a
    non-empty speaker and receiver, the number of whitespace-separated words
    in the line text is added to the cell ``[speaker][receiver]``; a missing
    ``<line>`` contributes 0 words.

    Args:
        file_path: Path of the UTF-8 encoded XML file to read.

    Returns:
        A tuple ``(speakers, matrix)``:
        * ``speakers`` - alphabetically sorted list of the distinct speaker
          names found in the file (sorted so that row/column order is the
          same on every run).
        * ``matrix`` - list of rows, one per speaker in ``speakers`` order;
          ``matrix[i][j]`` is the total number of words spoken by
          ``speakers[i]`` to ``speakers[j]``. Receivers that never appear as
          a speaker have no column.

    Notes:
        Names are stripped of surrounding whitespace, and empty names are
        ignored, so that formatting differences in the XML do not create
        distinct characters.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'xml')

    # word_count_interactions[speaker][receiver] -> total words spoken
    word_count_interactions = {}
    speaker_names = set()

    for dialogue in soup.find_all('dialogue'):
        # Look each tag up once instead of once for the test and once for the value.
        speaker_tag = dialogue.find('speaker')
        receiver_tag = dialogue.find('receiver')
        line_tag = dialogue.find('line')
        speaker = speaker_tag.text.strip() if speaker_tag is not None else ''
        receiver = receiver_tag.text.strip() if receiver_tag is not None else ''
        line = line_tag.text if line_tag is not None else ""

        # A character counts as a speaker even when the receiver is missing.
        if speaker:
            speaker_names.add(speaker)

        if not speaker or not receiver:
            continue

        per_receiver = word_count_interactions.setdefault(speaker, {})
        per_receiver[receiver] = per_receiver.get(receiver, 0) + count_words_in_line(line)

    # Sorting gives a deterministic order; iteration order of a set of
    # strings can differ between interpreter runs.
    speakers = sorted(speaker_names)

    matrix = [
        [word_count_interactions.get(speaker, {}).get(recv, 0) for recv in speakers]
        for speaker in speakers
    ]

    return speakers, matrix

def girvan_newman_community_detection(G):
    """Assign each node a community id from the first Girvan-Newman split.

    Only the first result produced by ``nx.community.girvan_newman`` is
    used. Communities are numbered in the order they appear in that result.

    Returns:
        Dict mapping each node to the integer id of its community.
    """
    first_split = next(nx.community.girvan_newman(G))

    partition = {}
    for community_id, members in enumerate(first_split):
        for member in members:
            partition[member] = community_id
    return partition

def visualize_communities(G, partition, output_file, suffix="_communities", seed=42):
    """Render the graph coloured by community and save it as HTML.

    Nodes are placed with a spring layout, labelled with their name and
    coloured by the community id found in ``partition``. The figure is
    written to ``<output_file><suffix>.html``.

    Args:
        G: networkx graph to draw.
        partition: Dict mapping every node of ``G`` to a numeric community id.
        output_file: Output path without suffix and extension.
        suffix: Text appended to ``output_file`` before ``.html``.
        seed: Seed passed to ``nx.spring_layout`` so that the layout is the
            same on every run; pass ``None`` for a random layout.

    Raises:
        KeyError: If a node of ``G`` is missing from ``partition``.
    """
    pos = nx.spring_layout(G, seed=seed)

    # All edges are drawn by one line trace; a None entry after each
    # segment breaks the line between consecutive edges.
    edge_x = []
    edge_y = []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    nodes = list(G.nodes())
    node_x = [pos[node][0] for node in nodes]
    node_y = [pos[node][1] for node in nodes]
    node_color = [partition[node] for node in nodes]

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        # Labels are converted to str once, here, instead of being assigned twice.
        text=[str(node) for node in nodes],
        textposition="top center",
        hoverinfo='text',
        marker=dict(
            showscale=True,
            colorscale='Viridis',
            color=node_color,
            size=10,
            line=dict(width=2)))

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # The nested title dict replaces the deprecated
                        # `titlefont_size` layout property.
                        title=dict(text='<br>Network graph with communities',
                                   font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )

    output_file = f"{output_file}{suffix}.html"
    fig.write_html(output_file)  # Save as HTML

def calculate_plot_save_centrality(G, output_file, suffix=""):
    """Compute node centralities, plot them and save plot and scores.

    Degree, betweenness, closeness and eigenvector centrality are computed
    for every node of ``G``. Two files are written:

    * ``<output_file><suffix>_centrality_plot.html`` - grouped bar chart.
    * ``<output_file><suffix>_centrality_scores.csv`` - one row per node,
      one column per centrality measure.

    Args:
        G: networkx graph to analyse.
        output_file: Output path without suffix and extension.
        suffix: Text inserted between ``output_file`` and the fixed file
            name endings.
    """
    # nx.eigenvector_centrality raises on a graph without nodes, which would
    # abort a whole batch run for a file with no usable dialogue; use an
    # empty result in that case so that (empty) outputs are still written.
    if len(G) == 0:
        eigenvector = {}
    else:
        eigenvector = nx.eigenvector_centrality(G, max_iter=1000)

    centrality_scores = {
        'Degree Centrality': nx.degree_centrality(G),
        'Betweenness Centrality': nx.betweenness_centrality(G),
        'Closeness Centrality': nx.closeness_centrality(G),
        'Eigenvector Centrality': eigenvector
    }

    # One row per node, one column per measure; the node index becomes a
    # regular 'Node' column for plotting and saving.
    df = pd.DataFrame(centrality_scores)
    df.index.name = 'Node'
    df = df.reset_index()
    df_melted = df.melt(id_vars=['Node'], var_name='Centrality Type', value_name='Score')

    # Plot the centrality scores
    fig = px.bar(df_melted, x='Node', y='Score', color='Centrality Type', barmode='group')
    fig.update_layout(title_text='Centrality Scores of Nodes in the Network')
    plot_file = f"{output_file}{suffix}_centrality_plot.html"
    fig.write_html(plot_file)

    # Save the centrality scores to a CSV file
    csv_file = f"{output_file}{suffix}_centrality_scores.csv"
    df.to_csv(csv_file, index=False)


In [8]:
# Run the full pipeline for every XML file in data_dir: interaction-count and
# word-count matrices, network plots, community plots and centrality scores.
# Files are processed in sorted order because os.listdir returns them in
# arbitrary order.
for file_name in sorted(f for f in os.listdir(data_dir) if f.endswith('.xml')):
    print(f"Processing {file_name}")
    xml_path = os.path.join(data_dir, file_name)
    # splitext removes only the final extension; splitting on the first '.'
    # would truncate names that contain dots and could make outputs collide.
    output_file = os.path.join(output_dir, os.path.splitext(file_name)[0])

    # Network weighted by number of dialogues.
    speakers, matrix = read_xml_and_create_matrix(xml_path)
    save_matrix_to_csv(speakers, matrix, output_file)
    G = create_network_from_matrix(speakers, matrix)
    visualize_network_interactive(G, output_file)

    # Network weighted by number of words spoken.
    speakers_words, matrix_words = read_xml_and_create_word_count_matrix(xml_path)
    save_matrix_to_csv(speakers_words, matrix_words, output_file, "_words_matrix")
    G_words = create_network_from_matrix(speakers_words, matrix_words)
    visualize_network_interactive(G_words, output_file, "_words_network")

    # Community detection on both networks.
    partition = girvan_newman_community_detection(G)
    partition_words = girvan_newman_community_detection(G_words)
    visualize_communities(G, partition, output_file)
    visualize_communities(G_words, partition_words, output_file, "_words_communities")

    # Centrality scores; the suffix of the second call is kept unchanged so
    # that existing output file names stay the same.
    calculate_plot_save_centrality(G, output_file)
    calculate_plot_save_centrality(G_words, output_file, "_words_combined_centrality_scores")


Processing Age_of_Ultron_xml.xml
Processing Infinity_War_xml.xml
