# This file attempts to perform Social Network Analysis on the ENTIRE dataset, as opposed to partial dataset in v1.

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import os
import random

In [None]:
# Inputs for the loading cell: the accumulator for edge records, the sampling
# percentage it consults, and the path of the tab-separated edge list.
data = []
subset_percentage = 40
file_path = "soc-Slashdot0902.txt"

In [None]:
# Read the edge list at `file_path` and append a random sample of its edges to
# `data` as {"FromNodeId": ..., "ToNodeId": ...} records.
with open(file_path, 'r') as f:
    # Skip the first 4 lines (header)
    for _ in range(4):
        next(f)

    # `subset_percentage` is expressed on a 0-100 scale, whereas
    # random.random() returns a float in [0.0, 1.0). Comparing the two directly
    # is always true for any percentage >= 1, so convert to a probability once.
    keep_probability = subset_percentage / 100

    # Read and process lines, keeping each edge with probability keep_probability
    for line in f:
        stripped = line.strip()
        if not stripped:
            # Blank lines (e.g. a trailing newline) carry no edge.
            continue
        if random.random() < keep_probability:
            source, target = map(int, stripped.split("\t"))
            data.append({"FromNodeId": source, "ToNodeId": target})

## Preprocessing and creation of a NetworkX directed graph

In [None]:
# Rebind `data` from the list of edge records to a pandas DataFrame
# (one row per record).
data = pd.DataFrame(data=data)

# Print the first five rows as a quick sanity check.
print(data.head(n=5))

In [None]:
# Build the directed graph: every DataFrame row contributes one edge
# FromNodeId -> ToNodeId.
G = nx.from_pandas_edgelist(
    data,
    "FromNodeId",
    "ToNodeId",
    create_using=nx.DiGraph(),
)

## Some Exploratory Data Analysis (EDA) on the graph

In [None]:
# Basic EDA: report how many nodes and edges the graph holds.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print(f"Number of nodes: {node_count}")
print(f"Number of edges: {edge_count}")

In [None]:
# Degree sequences: G.in_degree() / G.out_degree() iterate (node, degree)
# pairs; only the degree values are kept, in node-iteration order.
in_degrees = [degree for _node, degree in G.in_degree()]
out_degrees = [degree for _node, degree in G.out_degree()]

In [None]:
# Mean in- and out-degree, printed with two decimals.
mean_in_degree = np.mean(in_degrees)
mean_out_degree = np.mean(out_degrees)
print(f"Average in-degree: {mean_in_degree:.2f}")
print(f"Average out-degree: {mean_out_degree:.2f}")

In [None]:
# In-degree histogram: 50 bins with log=True, written to a PNG and then
# displayed.
plt.hist(in_degrees, log=True, bins=50)
plt.ylabel("Frequency")
plt.xlabel("In-degree")
plt.title("In-degree Distribution")
plt.savefig("in_degree_distribution.png", bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Out-degree histogram: 50 bins with log=True, written to a PNG and then
# displayed.
plt.hist(out_degrees, log=True, bins=50)
plt.ylabel("Frequency")
plt.xlabel("Out-degree")
plt.title("Out-degree Distribution")
plt.savefig('out_degree_distribution.png', bbox_inches='tight', dpi=300)
plt.show()

## Detection of communities using the Louvain method

In [None]:
import community as community_louvain
from community import best_partition

In [None]:
# Community detection runs on an undirected copy of the graph.
# `partition` is later indexed by node and its values are treated as
# community ids.
undirected_G = G.to_undirected()
partition = best_partition(undirected_G)

In [None]:
# Store each node's community id on the directed graph under the
# "community" node attribute.
nx.set_node_attributes(G, values=partition, name="community")

## Computing centrality measures for the nodes in the graph

In [None]:
# Degree centrality for every node of the directed graph; the result is
# later used as a node -> score mapping.
degree_centrality = nx.degree_centrality(G)

Disclaimer: the following block of code can be very compute-intensive; execute with caution.

In [None]:
# Betweenness centrality over the whole graph with library defaults.
# NOTE: this is the compute-intensive step the disclaimer refers to.
betweenness_centrality = nx.betweenness_centrality(G)

In [None]:
# Closeness centrality for every node, computed on the directed graph with
# library defaults.
closeness_centrality = nx.closeness_centrality(G)

In [None]:
# Eigenvector centrality for every node, using the library's default
# iteration limit and tolerance (none are overridden here).
eigenvector_centrality = nx.eigenvector_centrality(G)

In [None]:
# Attach every centrality score to the graph as a node attribute named after
# the measure.
for attr_name, scores in (
    ("degree_centrality", degree_centrality),
    ("betweenness_centrality", betweenness_centrality),
    ("closeness_centrality", closeness_centrality),
    ("eigenvector_centrality", eigenvector_centrality),
):
    nx.set_node_attributes(G, scores, attr_name)

In [None]:
# Shared drawing inputs for the visualization cells: one colour value per
# node (its community id from `partition`) and a spring layout with a fixed
# seed.
node_color = [partition[n] for n in G]
pos = nx.spring_layout(G, seed=42)


## General Visualization of the graph

In [None]:

# Draw the whole graph on a 10x10 figure using the shared layout and the
# current node_color values, then save and show it.
plt.figure(figsize=(10, 10))
nx.draw(
    G,
    pos,
    node_color=node_color,
    node_size=50,
    alpha=0.5,
    with_labels=False,
)
plt.title("General Visualization of the graph")
plt.savefig('v-General Visualization of the graph.png', bbox_inches='tight', dpi=400)
plt.show()


## Basic network visualization with node labels

In [None]:
# Same layout as before, with smaller markers and node labels switched on.
nx.draw(
    G,
    pos,
    node_color=node_color,
    node_size=20,
    alpha=0.5,
    with_labels=True,
)

## Network visualization with different node sizes based on degree centrality

In [None]:
# Node size is the node's degree centrality scaled by 5000.
node_size = [5000 * degree_centrality[n] for n in G]
nx.draw(
    G,
    pos,
    node_color=node_color,
    node_size=node_size,
    alpha=0.5,
    with_labels=False,
)

## Network visualization with different node colors based on betweenness centrality

In [None]:
# Rebind node_color to the betweenness centrality of each node and draw with
# the Blues colormap; the figure is saved to disk.
node_color = [betweenness_centrality[n] for n in G]
nx.draw(
    G,
    pos,
    node_color=node_color,
    cmap=plt.cm.Blues,
    node_size=20,
    alpha=0.5,
    with_labels=False,
)
plt.savefig('v-Network visualization with different node colors based on betweenness centrality.png', bbox_inches='tight', dpi=400)

## Network visualization with different node sizes based on eigenvector centrality

In [None]:
# Node size is the node's eigenvector centrality scaled by 5000.
# NOTE(review): node_color is not recomputed in this cell, so it carries
# whatever value an earlier cell last assigned - confirm that is the intended
# colouring for this plot.
node_size = [5000 * eigenvector_centrality[n] for n in G]
nx.draw(
    G,
    pos,
    node_color=node_color,
    node_size=node_size,
    alpha=0.5,
    with_labels=False,
)

## Network visualization with different node colors based on closeness centrality

In [None]:
# Rebind node_color to the closeness centrality of each node and draw with
# the Reds colormap.
node_color = [closeness_centrality[n] for n in G]
nx.draw(
    G,
    pos,
    node_color=node_color,
    cmap=plt.cm.Reds,
    node_size=20,
    alpha=0.5,
    with_labels=False,
)

## Network visualization with circular layout

In [None]:
# Circular layout. node_color is reused as left by an earlier cell.
circular_pos = nx.circular_layout(G)
nx.draw(
    G,
    circular_pos,
    node_color=node_color,
    node_size=20,
    alpha=0.5,
    with_labels=False,
)

## Network visualization with shell layout

In [None]:
# Shell layout. node_color is reused as left by an earlier cell.
shell_pos = nx.shell_layout(G)
nx.draw(
    G,
    shell_pos,
    node_color=node_color,
    node_size=20,
    alpha=0.5,
    with_labels=False,
)

## Network visualization with spectral layout

In [None]:
# Spectral layout, saved to disk. node_color is reused as left by an earlier
# cell.
spectral_pos = nx.spectral_layout(G)
nx.draw(
    G,
    spectral_pos,
    node_color=node_color,
    node_size=20,
    alpha=0.5,
    with_labels=False,
)
plt.savefig('v-Network visualization with spectral layout.png', bbox_inches='tight', dpi=400)

## Histogram of degree centrality values

In [None]:
# Histogram of the degree centrality scores: 50 bins with log=True, saved
# and shown.
degree_scores = list(degree_centrality.values())
plt.hist(degree_scores, log=True, bins=50)
plt.ylabel("Frequency")
plt.xlabel("Degree Centrality")
plt.title("Degree Centrality Distribution")
plt.savefig('v-Degree Centrality Distribution.png', bbox_inches='tight', dpi=400)
plt.show()

## Histogram of betweenness centrality values

In [None]:
# Histogram of the betweenness centrality scores: 50 bins with log=True,
# saved and shown.
betweenness_scores = list(betweenness_centrality.values())
plt.hist(betweenness_scores, log=True, bins=50)
plt.ylabel("Frequency")
plt.xlabel("Betweenness Centrality")
plt.title("Betweenness Centrality Distribution")
plt.savefig('v-Betweenness Centrality Distribution.png', bbox_inches='tight', dpi=400)
plt.show()

## Histogram of closeness centrality values

In [None]:
# Histogram of the closeness centrality scores: 50 bins with log=True, saved
# and shown.
closeness_scores = list(closeness_centrality.values())
plt.hist(closeness_scores, log=True, bins=50)
plt.ylabel("Frequency")
plt.xlabel("Closeness Centrality")
plt.title("Closeness Centrality Distribution")
plt.savefig('v-Closeness Centrality Distribution.png', bbox_inches='tight', dpi=400)
plt.show()

## Display of findings

In [None]:
# Summary: size of the graph in nodes and edges.
total_nodes = G.number_of_nodes()
total_edges = G.number_of_edges()
print(f"Number of nodes: {total_nodes}")
print(f"Number of edges: {total_edges}")

In [None]:
# Summary: mean in-degree and out-degree, printed with two decimals.
avg_in_degree = np.mean(in_degrees)
avg_out_degree = np.mean(out_degrees)
print(f"Average in-degree: {avg_in_degree:.2f}")
print(f"Average out-degree: {avg_out_degree:.2f}")

In [None]:
# Count the distinct community ids present in the partition and report the
# total.
community_ids = set(partition.values())
num_communities = len(community_ids)
print(f"Number of communities detected: {num_communities}")

In [None]:
# Rank all nodes by each centrality measure, highest score first. N is the
# number of top entries to report.
N = 10


def _by_score_desc(scores):
    """Return the (node, score) pairs of *scores*, highest score first."""
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)


sorted_degree_centrality = _by_score_desc(degree_centrality)
sorted_betweenness_centrality = _by_score_desc(betweenness_centrality)
sorted_closeness_centrality = _by_score_desc(closeness_centrality)
sorted_eigenvector_centrality = _by_score_desc(eigenvector_centrality)

In [None]:
# Print the top-N (node, score) pairs of every ranking, one section per
# centrality measure, with scores shown to four decimals.
rankings = (
    (f"\nTop {N} nodes by degree centrality:", sorted_degree_centrality),
    (f"\nTop {N} nodes by betweenness centrality:", sorted_betweenness_centrality),
    (f"\nTop {N} nodes by closeness centrality:", sorted_closeness_centrality),
    (f"\nTop {N} nodes by eigenvector centrality:", sorted_eigenvector_centrality),
)
for header, ranking in rankings:
    print(header)
    for node, value in ranking[:N]:
        print(f"Node {node}: {value:.4f}")