# Libraries

In [104]:
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Functions

### Build Graph (User - Video) from dataframe

In [None]:
def build_graph_from_df(df):
    """Build an undirected user/video graph from a pairwise dataframe.

    Args:
        df: DataFrame with the columns ``videoID``, ``userID_1`` and
            ``userID_2``; each row links two users through one video.

    Returns:
        An ``nx.Graph`` where every user and every video is a node (tagged
        with a ``type`` attribute of ``'user'`` or ``'video'``), every user
        is linked to each video it appears with, and the two users of every
        row are linked to each other.
    """
    graph = nx.Graph()

    # Nodes: users first, then videos (same insertion order as the edges below rely on)
    user_ids = pd.unique(df[['userID_1', 'userID_2']].values.ravel())
    video_ids = df['videoID'].unique()
    graph.add_nodes_from(user_ids, type='user')
    graph.add_nodes_from(video_ids, type='video')

    # User-video edges: stack both user columns under one name and de-duplicate
    memberships = pd.concat(
        [
            df[['videoID', side]].drop_duplicates().rename(columns={side: 'user'})
            for side in ('userID_1', 'userID_2')
        ]
    ).drop_duplicates()
    graph.add_edges_from(zip(memberships['user'], memberships['videoID']))

    # User-user edges: one per distinct (userID_1, userID_2) pair
    pairs = df[['userID_1', 'userID_2']].drop_duplicates()
    graph.add_edges_from(zip(pairs['userID_1'], pairs['userID_2']))

    return graph

### Degree variance without outliers

In [86]:
def degree_variance_without_outliers(G):
    """Return the variance of the node degrees after removing IQR outliers.

    Degrees outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are discarded and
    ``np.var`` is applied to what remains.

    Args:
        G: Graph whose ``degree()`` yields ``(node, degree)`` pairs.

    Returns:
        The variance of the retained degrees, or ``0`` for a graph
        without nodes.
    """
    node_degrees = np.array([deg for _, deg in G.degree()])

    # Nothing to measure on a graph without nodes
    if node_degrees.size == 0:
        return 0

    first_quartile, third_quartile = np.percentile(node_degrees, [25, 75])
    whisker = 1.5 * (third_quartile - first_quartile)

    # Keep only the degrees inside the whiskers
    within_whiskers = (
        (node_degrees >= first_quartile - whisker)
        & (node_degrees <= third_quartile + whisker)
    )
    return np.var(node_degrees[within_whiskers])

### Get graph metrics

In [128]:
def get_graph_metrics(G):
    """Compute summary structural metrics for an undirected graph.

    Args:
        G: The ``nx.Graph`` to analyse.

    Returns:
        A dict with the keys ``num_nodes``, ``num_edges``, ``num_components``,
        ``mean_degree``, ``var_degree`` (degree variance without IQR
        outliers), ``density``, ``diameter``, ``clustering``,
        ``avg_path_length``, ``modularity`` and ``assortativity``.

        ``avg_path_length``, ``diameter``, ``modularity`` and
        ``assortativity`` are only computed when the graph is connected;
        otherwise they are ``np.nan``. For a graph without nodes,
        ``mean_degree`` and ``clustering`` are ``np.nan`` as well.
    """
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()

    # One traversal gives both the component count and the connectivity test
    num_components = sum(1 for _ in nx.connected_components(G))

    var_degree = degree_variance_without_outliers(G)
    density = nx.density(G)

    # Every metric that may be skipped below starts as NaN so the returned
    # dict always carries the full set of keys
    mean_degree = clustering = np.nan
    avg_path_length = diameter = modularity = assortativity = np.nan

    if num_nodes > 0:
        # Averages over nodes are undefined for a graph without nodes
        mean_degree = np.mean([d for _, d in G.degree()])
        clustering = nx.average_clustering(G)

    # Exactly one component means the graph is non-empty and connected
    if num_components == 1:
        avg_path_length = nx.average_shortest_path_length(G)
        diameter = nx.diameter(G)
        communities = nx.community.greedy_modularity_communities(G)
        modularity = nx.community.modularity(G, communities)
        assortativity = nx.degree_assortativity_coefficient(G)

    metrics = {
        "num_nodes": num_nodes,
        "num_edges": num_edges,
        "num_components": num_components,
        "mean_degree": mean_degree,
        "var_degree": var_degree,
        "density": density,
        "diameter": diameter,
        "clustering": clustering,
        "avg_path_length": avg_path_length,
        "modularity": modularity,
        "assortativity": assortativity,
    }

    return metrics

### Get components dataframe

In [59]:
def get_components_dataframe(G):
    """Tabulate the connected components of a graph.

    Args:
        G: The ``nx.Graph`` to analyse.

    Returns:
        A DataFrame with one row per connected component and the columns
        ``component_id`` (1-based, in the order networkx yields the
        components), ``num_nodes`` and ``num_edges``, sorted by
        ``num_nodes`` in descending order with a fresh 0-based index.
        A graph without nodes yields an empty DataFrame that still has
        these three columns.
    """
    columns = ['component_id', 'num_nodes', 'num_edges']

    rows = [
        {
            'component_id': component_id,
            'num_nodes': len(nodes),
            'num_edges': G.subgraph(nodes).number_of_edges(),
        }
        for component_id, nodes in enumerate(nx.connected_components(G), start=1)
    ]

    # Declaring the columns keeps the sort below valid when there are no rows
    component_df = pd.DataFrame(rows, columns=columns)

    return component_df.sort_values('num_nodes', ascending=False).reset_index(drop=True)


### Plot component sizes

In [None]:
def plot_component_sizes(df_components):
    """Show a bar chart of component sizes on a logarithmic y-axis.

    Args:
        df_components: DataFrame with a ``num_nodes`` column, one row per
            component. It is re-sorted here, so the x-axis is the rank of
            each component by size (1 = largest).

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    ranked = df_components.sort_values('num_nodes', ascending=False).reset_index(drop=True)
    node_counts = ranked['num_nodes']

    # One bar per component, labelled with its node count; the hover text
    # shows the exact rank and node count
    bars = go.Bar(
        x=ranked.index + 1,
        y=node_counts,
        name='Number of Nodes',
        marker_color='skyblue',
        text=node_counts,
        textposition='auto',
        hovertemplate=(
            '<b>Component Rank</b>: %{x}<br>'
            '<b>Nodes</b>: %{y:,}<br>'
            '<extra></extra>'
        ),
    )
    fig = go.Figure(data=[bars])

    # Log-scaled axis with ticks labelled in the original (non-log) units
    log_axis = dict(
        title='Number of Nodes',
        type='log',
        tickvals=[1, 10, 100, 1000, 10000, 100000],
        ticktext=['1', '10', '100', '1K', '10K', '100K'],
        showgrid=True,
    )
    fig.update_layout(
        title='Component Sizes (Log Scale)',
        xaxis_title='Component Rank',
        yaxis=log_axis,
        template="plotly_white",
        showlegend=False,
        height=600,
    )

    fig.show()

### Plot component metrics

In [108]:
def plot_component_metrics(components, metrics_list):
    """Show one box plot per metric, computed across a list of graphs.

    Args:
        components: Iterable of graphs; ``get_graph_metrics`` is called on
            each of them.
        metrics_list: Names of the metrics (keys of the dict returned by
            ``get_graph_metrics``) to plot.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    per_graph_metrics = [get_graph_metrics(graph) for graph in components]

    # One row per graph, restricted to the requested metrics
    rows = [
        {'object_id': position, **{name: values[name] for name in metrics_list}}
        for position, values in enumerate(per_graph_metrics)
    ]
    df_metrics = pd.DataFrame(rows)

    # Only numeric columns can be box-plotted; the id column is bookkeeping
    plotted = [
        column
        for column in df_metrics.columns
        if column != 'object_id' and pd.api.types.is_numeric_dtype(df_metrics[column])
    ]

    fig = make_subplots(rows=1, cols=len(plotted), subplot_titles=plotted)

    # Each metric gets its own subplot column and its own y-axis
    for column_index, name in enumerate(plotted, start=1):
        fig.add_trace(go.Box(y=df_metrics[name], name=name), row=1, col=column_index)
        fig.update_yaxes(title_text="Value", row=1, col=column_index)

    fig.update_layout(
        title_text="Distribution of Metrics",
        showlegend=False,
        height=500,
        width=200 * len(plotted),  # widen the figure with the number of metrics
    )

    fig.show()

### Get top components

In [111]:
def get_top_components(G, top):
    """Return the largest connected components of ``G`` as standalone graphs.

    Args:
        G: The ``nx.Graph`` to split.
        top: Maximum number of components to return.

    Returns:
        A list of at most ``top`` new ``nx.Graph`` objects, ordered from the
        largest component (by node count) to the smallest.
    """
    ranked = sorted(nx.connected_components(G), key=len, reverse=True)

    # nx.Graph(...) turns each subgraph view into an independent graph
    return [nx.Graph(G.subgraph(nodes)) for nodes in ranked[:top]]

### Build Barabási-Albert from components

In [122]:
def build_ba_from_components(G_components):
    """Generate a Barabási-Albert graph matched to each input graph.

    For every graph, the generated one has the same number of nodes and its
    attachment parameter ``m`` is the edges-per-node ratio of the input,
    rounded and clamped to the range ``1 <= m <= n - 1``.

    Args:
        G_components: Iterable of graphs to imitate.

    Returns:
        A list with one generated ``nx.Graph`` per input graph, in the same
        order. Inputs with fewer than two nodes yield an edgeless graph with
        the same number of nodes, so the output stays aligned with the input.
    """
    G_ba_networks = []
    for component in G_components:
        n = component.number_of_nodes()

        if n < 2:
            # No valid m exists for fewer than two nodes (m must be >= 1 and < n),
            # and n == 0 would divide by zero below
            trivial = nx.Graph()
            trivial.add_nodes_from(range(n))
            G_ba_networks.append(trivial)
            continue

        edges_per_node = component.number_of_edges() / n

        # round() can return 0 (e.g. round(0.5) == 0), which the generator
        # rejects, so keep m inside its valid range
        m = min(max(int(round(edges_per_node)), 1), n - 1)

        G_ba_networks.append(nx.barabasi_albert_graph(n, m))
    return G_ba_networks

# Introduction

# Data Loading and Preprocessing

## Loading Data

In [None]:
# Load the pairwise sharing records from CSV and preview the first rows
df = pd.read_csv("pairwise_52seconds_share.csv")
df.head()

4471

## Preprocessing

In [56]:
# Drop self-pairs: keep only the rows where userID_1 differs from userID_2,
# then preview the first rows of the filtered frame
df = df[df['userID_1'] != df['userID_2']]
df.head()

Unnamed: 0.1,Unnamed: 0,videoID,userID_1,userID_2,timestamp_1,timestamp_2,time_diff_seconds
2,11,-ilNuSh1Fgw,feNNP607aG1F64jR6bk8jw,CVEf5dB1MvNRTQFYivAIPQ,2018-04-27 22:28:49,2018-04-27 22:29:36,47.0
3,12,-ilNuSh1Fgw,5SDVRa-J-_cWYP6g0WNzLw,jz6hyweGgVHGTw-PbEMqKw,2018-05-14 16:52:08,2018-05-14 16:52:24,16.0
4,13,-ilNuSh1Fgw,42Egn_22OjOzg2XMqAa9_g,poH0yvIGbS5_7MdXM4EuRA,2018-05-14 16:55:04,2018-05-14 16:55:15,11.0
5,14,-ilNuSh1Fgw,42Egn_22OjOzg2XMqAa9_g,MnOZgKSIq_7RKuR6XZqlxA,2018-05-14 16:55:04,2018-05-14 16:55:32,28.0
6,15,-ilNuSh1Fgw,poH0yvIGbS5_7MdXM4EuRA,MnOZgKSIq_7RKuR6XZqlxA,2018-05-14 16:55:15,2018-05-14 16:55:32,17.0


# Network Construction

In [90]:
# Build the user/video graph from the preprocessed dataframe
G = build_graph_from_df(df)

# Compute the summary metrics with the helper defined in the Functions section
metrics = get_graph_metrics(G)

print(f"Number of nodes: {metrics['num_nodes']}")
print(f"Number of edges: {metrics['num_edges']}")
print(f"Number of components: {metrics['num_components']}")
print(f"Mean degree: {metrics['mean_degree']:.2f}")
print(f"Degree variance: {metrics['var_degree']:.2f}")
print(f"Density: {metrics['density']:.4f}")
print(f"Diameter: {metrics['diameter']}")
print(f"Average clustering coefficient: {metrics['clustering']:.4f}")
print(f"Average path length: {metrics['avg_path_length']:.4f}")

Number of nodes: 4242
Number of edges: 8169
Number of components: 210
Mean degree: 3.85
Degree variance: 1.33
Density: 0.0009
Diameter: nan
Average clustering coefficient: 0.8822
Average path length: nan



<center>
<h3> Full network </h3>
<img src="images/full_network.png" alt="Full network generated with Gephi" width="600" height="600">
</center>

In [91]:
# Tabulate node/edge counts per connected component and plot the component sizes
components_df = get_components_dataframe(G)
plot_component_sizes(components_df)

In [None]:
# Extract the 20 largest connected components as standalone graphs and
# box-plot the selected metrics across them
components = get_top_components(G, top=20)
plot_component_metrics(
    components, 
    metrics_list=[
        "mean_degree", 
        "density", 
        "clustering", 
        "avg_path_length", 
        "modularity",
        "assortativity"
        ]
)

In [76]:
# Cumulative node coverage, walking the components in the order of
# components_df (largest first, as returned by get_components_dataframe)
components_df['cumulative_nodes'] = components_df['num_nodes'].cumsum()
total_nodes = components_df['num_nodes'].sum()
components_df['cumulative_percent'] = (components_df['cumulative_nodes'] / total_nodes) * 100

# Coverage threshold, in percent of all nodes.
# NOTE(review): the *_95 names below are kept unchanged in case other cells
# use them, but the threshold actually applied is this value (80), not 95.
coverage_target = 80

# Components whose cumulative share stays within the threshold.
# NOTE(review): the component that crosses the threshold is excluded, so the
# reported coverage ends just below the target rather than at or above it.
components_for_95 = components_df[components_df['cumulative_percent'] <= coverage_target].copy()

# Calculate how much of the network these components represent
num_components_95 = len(components_for_95)
# The selection is empty when the largest component alone exceeds the
# threshold; report 0% instead of failing on .iloc[-1]
percent_nodes_covered = components_for_95['cumulative_percent'].iloc[-1] if num_components_95 else 0.0
percent_edges_covered = (components_for_95['num_edges'].sum() / components_df['num_edges'].sum()) * 100

print(f"To cover {coverage_target}% of nodes, you need the top {num_components_95} largest components")
print("These components contain:")
print(f"- {components_for_95['num_nodes'].sum():,} nodes ({percent_nodes_covered:.2f}% of total)")
print(f"- {components_for_95['num_edges'].sum():,} edges ({percent_edges_covered:.2f}% of total)")

# Show the breakdown of these components
print("\nBreakdown of these components:")
print(components_for_95[['component_id', 'num_nodes', 'num_edges', 'cumulative_percent']])

To cover 80% of nodes, you need the top 27 largest components
These components contain:
- 3,380 nodes (79.68% of total)
- 7,040 edges (86.18% of total)

Breakdown of these components:
    component_id  num_nodes  num_edges  cumulative_percent
0             39       1722       3697           40.594059
1             44        563       1132           53.866101
2            210        127        444           56.859972
3             13        122        199           59.735974
4              2         97        200           62.022631
5            206         60         94           63.437058
6             18         58        131           64.804338
7            170         57         88           66.148043
8             57         55        104           67.444602
9            192         53         86           68.694012
10            42         52        122           69.919849
11            11         44         82           70.957096
12           149         44         80           

# Choice of Network Metrics

| Metric | Description | Importance in Model |
|---------|--------------|----------------|
| Degree Distribution | Measures how many connections each node has | It can give an idea of whether some videos are shared by many users |
| Clustering Coefficient | Probability that two neighbors of a node are also connected | It could indicate local coordination or group formation |
| Average Path Length | Average number of steps between any two nodes | Shows how quickly information can spread |
| Assortativity | Correlation between degrees of connected nodes | Reveals whether similar nodes tend to connect |
| Modularity | Measures strength of community structure | Detects clusters or coordinated subgroups |


# Baseline Network Models

## Barabási-Albert Network

In [123]:
# Generate one Barabási-Albert graph per top component, merge them into a
# single graph with disjoint node labels, and export the result as GEXF
G_ba_networks = build_ba_from_components(components)
combined_BA = nx.disjoint_union_all(G_ba_networks)
nx.write_gexf(combined_BA, "G_combined_BA.gexf")

<center>
<h3> Barabási-Albert network </h3>
<img src="images/ba_network.png" alt="Barabási-Albert network generated with Gephi" width="600" height="600">
</center>

In [131]:
# Box-plot the selected metrics across the generated Barabási-Albert graphs
plot_component_metrics(
    G_ba_networks, 
    metrics_list=[
        "mean_degree", 
        "density", 
        "clustering", 
        "avg_path_length", 
        "modularity",
        "assortativity"
        ]
)

In [132]:
# Box-plot the same metrics for the real top components (the `components`
# list built earlier with get_top_components)
plot_component_metrics(
    components, 
    metrics_list=[
        "mean_degree", 
        "density", 
        "clustering", 
        "avg_path_length", 
        "modularity",
        "assortativity"
        ]
)

## Model 2

# Evaluation and Interpretation