## Importing Libraries

In [2]:
# %pip (unlike !pip) installs into the environment of the running kernel;
# the version is pinned to the release this notebook was executed with.
%pip install -q networkx==3.5

Collecting networkx
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: networkx
Successfully installed networkx-3.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [78]:
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
import numpy as np
import random

In [43]:
# Read the pairwise CSV into a DataFrame; later cells filter and build graphs from `df`.
# NOTE(review): the preview shows "Unnamed: 0" / "Unnamed: 0.1" columns, presumably
# row indexes saved by earlier to_csv() calls — confirm; they are not used below.
df = pd.read_csv("pairwise_52seconds_share.csv")
# Show the first rows to check the schema
df.head()


Unnamed: 0.1,Unnamed: 0,videoID,userID_1,userID_2,timestamp_1,timestamp_2,time_diff_seconds
0,0,-6bGXfM8-gs,19372991|840224732847833,19372991|840224732847833,2018-07-22 21:19:58,2018-07-22 21:19:58,0.0
1,10,-fJbMWhkTAw,Ej8Mm0YMadzmx4osDA_hgg,Ej8Mm0YMadzmx4osDA_hgg,2018-08-01 00:51:08,2018-08-01 00:51:08,0.0
2,11,-ilNuSh1Fgw,feNNP607aG1F64jR6bk8jw,CVEf5dB1MvNRTQFYivAIPQ,2018-04-27 22:28:49,2018-04-27 22:29:36,47.0
3,12,-ilNuSh1Fgw,5SDVRa-J-_cWYP6g0WNzLw,jz6hyweGgVHGTw-PbEMqKw,2018-05-14 16:52:08,2018-05-14 16:52:24,16.0
4,13,-ilNuSh1Fgw,42Egn_22OjOzg2XMqAa9_g,poH0yvIGbS5_7MdXM4EuRA,2018-05-14 16:55:04,2018-05-14 16:55:15,11.0


## Getting Metrics

In [48]:
def get_metrics(G):
    """Print summary statistics of a NetworkX graph.

    Reported values: mean and variance of the node degrees, average clustering
    coefficient, average shortest path length, degree assortativity
    coefficient, and the standard deviation of the degrees (used here as a
    simple centralization proxy).

    Parameters
    ----------
    G : networkx graph
        Graph to summarise. Must contain at least one node.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Raises
    ------
    ValueError
        If ``G`` has no nodes (none of the metrics are defined then).

    Notes
    -----
    The average shortest path length is only defined on a connected graph.
    For a disconnected *undirected* graph it is computed on the largest
    connected component, and the printed line says so. Directed graphs are
    passed to NetworkX unchanged.
    """
    total_nodes = G.number_of_nodes()
    if total_nodes == 0:
        raise ValueError("get_metrics() requires a graph with at least one node")

    # Degree distribution
    degrees = [d for _, d in G.degree()]
    mean_degree = np.mean(degrees)
    var_degree = np.var(degrees)

    # Average clustering coefficient
    clustering = nx.average_clustering(G)

    # Average shortest path length: nx.average_shortest_path_length raises on a
    # disconnected graph, so fall back to the largest connected component.
    path_graph = G
    if not G.is_directed() and not nx.is_connected(G):
        path_graph = G.subgraph(max(nx.connected_components(G), key=len))
    avg_path_length = nx.average_shortest_path_length(path_graph)

    # Assortativity coefficient
    assortativity = nx.degree_assortativity_coefficient(G)

    # Centralization proxy (standard deviation of degrees)
    degree_centralization = np.std(degrees)

    print(f"Mean degree: {mean_degree:.2f}")
    print(f"Degree variance: {var_degree:.2f}")
    print(f"Average clustering coefficient: {clustering:.4f}")
    if path_graph is G:
        print(f"Average path length: {avg_path_length:.4f}")
    else:
        # Make it explicit that the value does not cover the whole graph
        print(
            f"Average path length: {avg_path_length:.4f} "
            f"(largest component, {path_graph.number_of_nodes()} of {total_nodes} nodes)"
        )
    print(f"Assortativity coefficient: {assortativity:.4f}")
    print(f"Degree centralization (std): {degree_centralization:.2f}")

## Draw Degree Distribution

In [55]:
def draw_degree_distribution(G):
    """Display the empirical degree distribution of ``G`` on log-log axes.

    Each distinct degree k found in the graph is plotted against P(k), the
    fraction of nodes that have that degree.

    Parameters
    ----------
    G : networkx graph
        Graph whose node degrees are plotted.

    Returns
    -------
    None
        The Plotly figure is shown, not returned.
    """
    # NOTE(review): degree 0 has no position on a log axis — confirm how
    # isolated nodes, if any, should be reported.
    degree_values = [deg for _, deg in G.degree()]

    # Distinct degrees and the share of nodes holding each of them
    ks, node_counts = np.unique(degree_values, return_counts=True)
    pk = node_counts / node_counts.sum()

    points = go.Scatter(
        x=ks,
        y=pk,
        mode='markers',
        marker=dict(size=6, color='royalblue'),
        name="Degree Distribution",
    )

    fig = go.Figure()
    fig.add_trace(points)

    fig.update_layout(
        title="Degree Distribution (Log-Log)",
        template="plotly_white",
    )
    fig.update_xaxes(type="log", title="Degree (k)")
    fig.update_yaxes(type="log", title="P(k)")

    fig.show()

## Building graphs from the dataset

In [50]:
# Select the rows, videos and users that the graphs are built from

# Keep pairs at most 52 seconds apart, then discard rows where both sides
# are the same user
filtered = (
    df
    .loc[lambda frame: frame['time_diff_seconds'] <= 52]
    .loc[lambda frame: frame['userID_1'] != frame['userID_2']]
)

# Videos that still have at least one valid pair
valid_videos = pd.unique(filtered['videoID'])

# Every user that appears on either side of a valid pair
users_involved = pd.unique(filtered[['userID_1', 'userID_2']].to_numpy().ravel())

### User - User Graph

In [51]:
# Build the undirected user-user interaction graph
G_users = nx.Graph()

# One node per user, tagged with its kind
G_users.add_nodes_from(users_involved, type='user')

# nx.Graph is undirected, so (A, B) and (B, A) are the same edge. Put every pair
# in a canonical (sorted) order before averaging; otherwise the two orderings are
# averaged separately and the later add_edge() call overwrites the earlier
# time_diff instead of contributing to one mean per pair.
# Rows with a missing user ID are dropped first (groupby would discard them anyway).
pair_rows = filtered.dropna(subset=['userID_1', 'userID_2'])
ordered_pairs = np.sort(pair_rows[['userID_1', 'userID_2']].to_numpy(), axis=1)

# Average time difference per unordered pair of users
df_agg = (
    pd.DataFrame({
        'userID_1': ordered_pairs[:, 0],
        'userID_2': ordered_pairs[:, 1],
        'time_diff_seconds': pair_rows['time_diff_seconds'].to_numpy(),
    })
    .groupby(['userID_1', 'userID_2'])
    .agg({'time_diff_seconds': 'mean'})
    .reset_index()
)

# Add one user-user edge per pair, carrying the mean time difference
for user_a, user_b, mean_diff in df_agg.itertuples(index=False, name=None):
    G_users.add_edge(user_a, user_b, time_diff=mean_diff)

print(f"Length of no filtered data: {len(df)}")
print(f"Length of filtered data: {len(filtered)}")
print(f"Length of graph dataset: {len(df_agg)}")


# Writing graph to file
nx.write_gexf(G_users, 'users_graph.gexf')

Length of no filtered data: 4471
Length of filtered data: 4127
Length of graph dataset: 4078


### User - Video Graph

In [52]:
G_users_video = nx.Graph()

# Register the user nodes first, then the video nodes
G_users_video.add_nodes_from(users_involved, type='user')
G_users_video.add_nodes_from(valid_videos, type='video')


# Collect the distinct (video, user) pairs from both sides of each row;
# the second side is renamed so the two frames stack into the same columns
user1_video_df = filtered[['videoID', 'userID_1']].drop_duplicates()
user2_video_df = (
    filtered[['videoID', 'userID_2']]
    .drop_duplicates()
    .rename(columns={'userID_2': 'userID_1'})
)
user_video_df = (
    pd.concat([user1_video_df, user2_video_df])
    .drop_duplicates()
    .rename(columns={'userID_1': 'user', 'videoID': 'video'})
)

# One edge per distinct user-video pair
G_users_video.add_edges_from(zip(user_video_df['user'], user_video_df['video']))


# Writing graph to file
nx.write_gexf(G_users_video, 'users_video_graph.gexf')


### User - User - Video Graph

In [95]:
G_users2_video = nx.Graph()

# Register both kinds of node up front: users first, then videos
for node_kind, node_ids in (('user', users_involved), ('video', valid_videos)):
    for node_id in node_ids:
        G_users2_video.add_node(node_id, type=node_kind)


# Distinct (video, user) pairs taken from both sides of each row
user1_video_df = filtered[['videoID', 'userID_1']].drop_duplicates()
user2_video_df = (
    filtered[['videoID', 'userID_2']]
    .drop_duplicates()
    .rename(columns={'userID_2': 'userID_1'})
)
user_video_df = (
    pd.concat([user1_video_df, user2_video_df])
    .drop_duplicates()
    .rename(columns={'userID_1': 'user', 'videoID': 'video'})
)

# User-video edges (the frame's columns are ordered video, user)
for video_id, user_id in user_video_df.itertuples(index=False, name=None):
    G_users2_video.add_edge(user_id, video_id)

# Distinct ordered user pairs found in the filtered rows
user_interactions = filtered[['userID_1', 'userID_2']].drop_duplicates()

# User-user edges
for first_user, second_user in user_interactions.itertuples(index=False, name=None):
    G_users2_video.add_edge(first_user, second_user)

# Writing graph to file
nx.write_gexf(G_users2_video, 'users2_video_graph.gexf')

## Barabási-Albert model

In [67]:

# Generate a scale-free (Barabási–Albert) reference network with the same
# number of nodes as the video-user-user graph
n = G_users2_video.number_of_nodes()

# Mean degree of an undirected graph is 2E/N: every edge adds one to the
# degree of two nodes
avg_degree = 2 * G_users2_video.number_of_edges() / n

# Each node added by the BA model brings m edges, so the result has a mean
# degree of about 2m. m must be at least 1 — barabasi_albert_graph rejects 0,
# which plain rounding would produce for a very sparse graph.
m = max(1, int(round(avg_degree / 2)))

# Fixed seed so the reference network and its metrics are reproducible
G_ba = nx.barabasi_albert_graph(n, m, seed=42)
get_metrics(G_ba)

# Writing File
nx.write_gexf(G_ba, "G_barabasi_albert.gexf")


Mean degree: 4.00
Degree variance: 34.28
Average clustering coefficient: 0.0081
Average path length: 4.7032
Assortativity coefficient: -0.0524
Degree centralization (std): 5.85


In [None]:
# Plot the degree distribution of the synthetic Barabási–Albert network (log-log axes)
draw_degree_distribution(G_ba)

In [98]:
# Plot the degree distribution of the graph built from the real data
# (users, videos, and user-user interactions) on the same log-log axes
draw_degree_distribution(G_users2_video)

## Real Data Graph Transformation

In [96]:

# Attach a user, together with one of their user neighbours, to a video
def connect_user_to_video(G, video, user):
    """Link ``user`` and one randomly chosen user neighbour to ``video``.

    The graph is modified in place. Nothing is added when ``user`` has no
    neighbour whose ``type`` attribute is ``"user"``, or when either ``user``
    or the chosen neighbour is already connected to ``video``.

    Parameters
    ----------
    G : graph
        Graph to modify; node attributes are read through ``G.nodes``.
    video : hashable
        Video node that receives the new edges.
    user : hashable
        User node to connect; must already be a node of ``G``.

    Returns
    -------
    None
    """
    # Neighbours of the user that are themselves users
    user_neighbors = [n for n in G.neighbors(user) if G.nodes[n].get("type") == "user"]

    # Without a user neighbour there is nobody to bring along, and
    # random.choice([]) would raise IndexError
    if not user_neighbors:
        return

    # Pick one user neighbour at random
    user_neighbor = random.choice(user_neighbors)

    # Leave the graph untouched if either of them is already linked to the video
    if G.has_edge(user, video) or G.has_edge(user_neighbor, video):
        return

    # Connect both the user and the chosen neighbour to the video
    G.add_edge(user, video)
    G.add_edge(user_neighbor, video)

# Work on a copy so the original graph stays untouched
G_transformed = G_users2_video.copy()

# Split the nodes by their "type" attribute
video_nodes = [n for n, d in G_transformed.nodes(data=True) if d["type"] == "video"]
user_nodes = [n for n, d in G_transformed.nodes(data=True) if d["type"] == "user"]

# Count the neighbours of every video. The frame is built in one step:
# appending rows one at a time with .loc copies the frame on every insert and
# leaves the columns with object dtype.
video_neighbors = pd.DataFrame({
    "video": video_nodes,
    "user_count": [len(list(G_transformed.neighbors(video))) for video in video_nodes],
})

# Selection weights: a video is drawn with probability proportional to its
# neighbour count. user_count is a snapshot taken before any edge is added,
# so the weights are the same for every draw and are computed once.
probabilities = video_neighbors['user_count'] / video_neighbors['user_count'].sum()

# Number of connection attempts; an attempt adds nothing when the user (or the
# neighbour picked for them) is already linked to the drawn video
connections = 10
for i in range(connections):
    # Draw a video; higher chance for videos with a higher user_count.
    # The draw is an index label, so it is looked up by label (not by position).
    random_index = np.random.choice(video_neighbors.index, p=probabilities.to_numpy())
    selected_video = video_neighbors.at[random_index, "video"]

    # Draw a user uniformly at random
    user = random.choice(user_nodes)

    # Only connect when the user is not already linked to the video
    if not G_transformed.has_edge(user, selected_video):
        connect_user_to_video(G_transformed, selected_video, user)


    

In [97]:
# Plot the degree distribution of the transformed copy of the real-data graph
draw_degree_distribution(G_transformed)

In [100]:
# Keep only the largest connected component of the transformed graph

# Rank the connected components from largest to smallest
connected_components = list(nx.connected_components(G_transformed))
connected_components.sort(key=len, reverse=True)

# The head of the ranking is the largest component
largest_component = connected_components[0]

# Materialise that component as an independent graph
G_largest = G_transformed.subgraph(largest_component).copy()

# Sanity checks
print(f"Original graph nodes: {G_transformed.number_of_nodes()}")
print(f"Largest component nodes: {G_largest.number_of_nodes()}")
print(f"Is connected: {nx.is_connected(G_largest)}")

Original graph nodes: 4242
Largest component nodes: 2584
Is connected: True


In [101]:
# Print the summary metrics for the largest connected component of the transformed graph
get_metrics(G_largest)

Mean degree: 4.22
Degree variance: 1269.51
Average clustering coefficient: 0.8545
Average path length: 4.5530
Assortativity coefficient: -0.2327
Degree centralization (std): 35.63
