In [11]:
!pip install igraph
import sqlite3
import networkx as nx

# --- Database Connection and Data Retrieval ---
# Reads every profile plus the two Facebook-related connection types from the
# SQLite database into plain lists of dicts (`all_profiles`, `profile_connections`).

db_path = "social_network_anonymized.db"  # Replace with your actual path

conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Fetch Profiles (all, for later use)
    cursor.execute("SELECT id, name, profile_type, profile_url FROM Profiles")
    all_profiles = [
        {"id": profile_id, "name": name, "profile_type": profile_type, "profile_url": profile_url}
        for profile_id, name, profile_type, profile_url in cursor.fetchall()
    ]

    # Fetch Profile Connections (focus on relevant connection types)
    cursor.execute("""
        SELECT source_id, target_id, connection_type
        FROM ProfileConnection
        WHERE connection_type IN ('BECAME_MEMBER_OF_GROUP_ON_FACEBOOK', 'INTERACTED_IN_THE_CONTEXT_OF_ON_FACEBOOK')
    """)
    profile_connections = [
        {"source_id": source_id, "target_id": target_id, "connection_type": connection_type}
        for source_id, target_id, connection_type in cursor.fetchall()
    ]
finally:
    # Release the connection even when one of the queries above raises
    # (previously a failing query left the connection open).
    conn.close()

print(f"Fetched {len(all_profiles)} profiles and {len(profile_connections)} connections.")

Collecting igraph
  Downloading igraph-0.11.8-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting texttable>=1.6.2 (from igraph)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading igraph-0.11.8-cp39-abi3-macosx_11_0_arm64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph
Successfully installed igraph-0.11.8 texttable-1.7.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Fetched 25461 profiles and 2505 connections.


In [3]:
!pip install igraph
import sqlite3
import igraph as ig

# --- Database Connection ---
db_path = "social_network_anonymized.db"  # !!! Replace with your actual path !!!
conn = sqlite3.connect(db_path)

# --- Data Retrieval ---
# Everything that touches the database runs inside try/finally so the
# connection is closed even if a query fails part-way through.
try:
    cursor = conn.cursor()

    # 1. Fetch Profiles
    cursor.execute("SELECT id, name, profile_type FROM Profiles")
    profiles = [
        {"id": row[0], "name": row[1], "type": row[2], "node_type": "profile"}
        for row in cursor.fetchall()
    ]

    # 2. Profile ID to Vertex Index Mapping (profiles occupy indices 0 .. len(profiles) - 1)
    profile_id_to_index = {profile["id"]: i for i, profile in enumerate(profiles)}

    # 3. Fetch Activities
    cursor.execute("SELECT id, type, content, description, timestamp FROM Activity")
    activities = [
        {"id": row[0], "type": row[1], "content": row[2], "description": row[3], "timestamp": row[4], "node_type": "activity"}
        for row in cursor.fetchall()
    ]

    # 4. Activity ID to Vertex Index Mapping (offset so activities follow the profiles)
    activity_id_to_index = {activity["id"]: i + len(profiles) for i, activity in enumerate(activities)}

    # 5. Fetch Profile-Activity Relationships together with the activity timestamp.
    cursor.execute("""
        SELECT pa.profile_id, pa.activity_id, pa.relationship_type, a.timestamp
        FROM ProfileActivity pa
        JOIN Activity a ON pa.activity_id = a.id
    """)
    # Each entry is ((source_vertex_index, target_vertex_index), timestamp).
    profile_activity_edges = []
    for profile_id, activity_id, rel_type, timestamp in cursor.fetchall():
        # Skip rows that reference a profile or activity that was not loaded above.
        if profile_id not in profile_id_to_index or activity_id not in activity_id_to_index:
            continue

        profile_index = profile_id_to_index[profile_id]
        activity_index = activity_id_to_index[activity_id]

        if rel_type in ("source", "creator"):
            # Profile -> Activity
            profile_activity_edges.append(((profile_index, activity_index), timestamp))
        elif rel_type == "target":
            # Activity -> Profile
            profile_activity_edges.append(((activity_index, profile_index), timestamp))
        # Any other relationship type is ignored.

    # 6. Fetch Profile Connections
    cursor.execute("SELECT source_id, target_id, connection_type FROM ProfileConnection")
    profile_connection_edges = []
    for source_id, target_id, _connection_type in cursor.fetchall():
        if source_id not in profile_id_to_index or target_id not in profile_id_to_index:
            continue

        # Profile -> Profile. No timestamp is selected for connections, so None is
        # stored to keep the same (edge, timestamp) shape as the activity edges.
        profile_connection_edges.append(
            ((profile_id_to_index[source_id], profile_id_to_index[target_id]), None)
        )
finally:
    conn.close()

print(f"Fetched {len(profiles)} profiles, {len(activities)} activities.")
print(f"Prepared {len(profile_activity_edges)} profile-activity edges and {len(profile_connection_edges)} profile-connection edges.")

# --- Build the igraph Graph ---
# One directed graph holding every profile and activity as a vertex. Vertex
# indices follow profile_id_to_index / activity_id_to_index.

graph = ig.Graph(directed=True)
vertex_count = len(profiles) + len(activities)
graph.add_vertices(vertex_count)

# Collect the vertex attributes in plain lists first and assign each attribute
# once, instead of writing four attributes vertex by vertex.
vertex_names = [None] * vertex_count
vertex_types = [None] * vertex_count
vertex_node_types = [None] * vertex_count
vertex_original_ids = [None] * vertex_count

# Vertex attributes (Profiles)
for profile in profiles:
    vertex_index = profile_id_to_index[profile["id"]]
    vertex_names[vertex_index] = profile["name"]
    vertex_types[vertex_index] = profile["type"]
    vertex_node_types[vertex_index] = profile["node_type"]
    vertex_original_ids[vertex_index] = profile["id"]

# Vertex attributes (Activities)
for activity in activities:
    vertex_index = activity_id_to_index[activity["id"]]
    vertex_names[vertex_index] = f"Activity_{activity['id']}"
    vertex_types[vertex_index] = activity["type"]
    vertex_node_types[vertex_index] = activity["node_type"]
    vertex_original_ids[vertex_index] = activity["id"]

graph.vs["name"] = vertex_names
graph.vs["type"] = vertex_types
graph.vs["node_type"] = vertex_node_types
graph.vs["original_id"] = vertex_original_ids

# Add all edges *with* timestamps in a single call. Calling add_edge() once per
# edge is very slow on a graph of this size; add_edges() with the `attributes`
# mapping adds them in bulk. Edge order is unchanged: profile-activity edges
# first, then profile-connection edges (whose timestamp is None).
all_edges = profile_activity_edges + profile_connection_edges
if all_edges:
    graph.add_edges(
        [edge for edge, _ in all_edges],
        attributes={"timestamp": [timestamp for _, timestamp in all_edges]},
    )


# --- Remove Isolated Nodes ---

# Indices of all vertices that have no incident edge at all (degree 0).
isolated_vertices = [
    vertex_index
    for vertex_index, vertex_degree in enumerate(graph.degree())
    if vertex_degree == 0
]

# Drop them from the graph in one call.
graph.delete_vertices(isolated_vertices)

print(f"Removed {len(isolated_vertices)} isolated vertices.")
print(f"Graph now has {graph.vcount()} vertices and {graph.ecount()} edges.")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Fetched 25461 profiles, 102738 activities.
Prepared 83584 profile-activity edges and 43380 profile-connection edges.
Removed 45957 isolated vertices.
Graph now has 82242 vertices and 126964 edges.


In [None]:
# Original IDs of the profiles treated as "bad". The same four IDs are assigned
# to `bad_profile_ids` in the feature-engineering cell.
BAD_PROFILE_IDS = [23381, 23387, 23412, 23539]

In [6]:
bad_profile_ids = [23381, 23387, 23412, 23539]
training_cutoff_timestamp = 1714533722000  # Timestamp by which 75% of all activity had occurred
'''WITH OrderedActivity AS (
    SELECT timestamp, 
           ROW_NUMBER() OVER (ORDER BY timestamp) AS row_num,
           COUNT(*) OVER () AS total_rows
    FROM Activity
)
SELECT timestamp 
FROM OrderedActivity
WHERE row_num = CEIL(total_rows * 0.75)  -- 75th percentile row
LIMIT 1;'''
# The bare string above is the SQL used to find the cutoff (it is kept for
# reference only and is not executed here). The cutoff splits the data into a
# training period and a testing period.

# --- 1. Create Training and Testing Subgraphs ---

# Edges without a timestamp belong to neither period; the rest are split by
# whether they fall at/before or after the cutoff.
training_edges = []
testing_edges = []
for edge in graph.es:
    edge_timestamp = edge["timestamp"]
    if edge_timestamp is None:
        continue
    if edge_timestamp <= training_cutoff_timestamp:
        training_edges.append(edge.index)
    elif edge_timestamp > training_cutoff_timestamp:
        testing_edges.append(edge.index)

# delete_vertices=False keeps every vertex of the full graph in both subgraphs.
training_graph = graph.subgraph_edges(training_edges, delete_vertices=False)
testing_graph = graph.subgraph_edges(testing_edges, delete_vertices=False)

# Only the training graph is then stripped of vertices that ended up with no edges.
isolated_in_training = [vertex.index for vertex in training_graph.vs if vertex.degree() == 0]
training_graph.delete_vertices(isolated_in_training)


# --- 2. Feature Engineering (Training Period) ---
# Smallest and largest edge timestamp in the training graph. With no edges the
# sentinels +inf / -inf are kept.
_training_timestamps = [
    edge["timestamp"]
    for edge in training_graph.es
    if "timestamp" in edge.attributes()
]
min_timestamp = min(_training_timestamps, default=float("inf"))
max_timestamp = max(_training_timestamps, default=float("-inf"))
print(f"min: {min_timestamp}, max: {max_timestamp}")
features = []          # one row per non-bad profile, in the column order of feature_names
profile_ids_list = []  # original profile IDs, in the same order as `features`

_bad_profile_id_set = set(bad_profile_ids)

# Timestamps are epoch *milliseconds* (the cutoff 1714533722000 is a 13-digit
# value), so a day is 86,400,000 units. The previous divisor of 60*60*24 treated
# them as seconds and made the "per day" rate 1000x too small.
MS_PER_DAY = 1000 * 60 * 60 * 24

# The observation window is the same for every profile, so compute it once.
time_span = max_timestamp - min_timestamp
time_span_days = time_span / MS_PER_DAY if time_span > 0 else 0

# Iterate through profiles in the training graph, skipping the bad ones.
for profile_vertex in training_graph.vs:
    if profile_vertex["node_type"] != "profile":
        continue
    profile_id = profile_vertex["original_id"]
    if profile_id in _bad_profile_id_set:
        continue

    vertex_index = profile_vertex.index
    profile_ids_list.append(profile_id)

    # --- Network Features ---
    degree = training_graph.degree(vertex_index)
    in_degree = training_graph.indegree(vertex_index)
    out_degree = training_graph.outdegree(vertex_index)
    # mode="zero" reports 0 instead of NaN for vertices with fewer than two neighbours.
    clustering_coefficient = training_graph.transitivity_local_undirected(vertex_index, mode="zero")
    #pagerank = training_graph.pagerank(vertex_index) # PageRank
    #betweenness = training_graph.betweenness(vertex_index) # Betweenness

    # Number of neighbouring *profiles* that are not "bad".
    # NOTE(review): training_graph only contains timestamped edges; if
    # profile-to-profile edges carry no timestamp this count is always 0 - confirm.
    good_neighbors = 0
    for neighbor_index in training_graph.neighbors(vertex_index):
        neighbor = training_graph.vs[neighbor_index]
        if neighbor["node_type"] == "profile" and neighbor["original_id"] not in _bad_profile_id_set:
            good_neighbors += 1

    # --- Activity Features ---
    # Number of distinct edges touching this profile. Looking up the incident
    # edges directly replaces a scan over every edge in the graph per profile;
    # set() keeps a self-loop from being counted twice.
    num_activities = len(set(training_graph.incident(vertex_index, mode="all")))

    # Activity rate (edges per day); 0 when the window is empty or degenerate.
    if time_span_days > 0:
        activity_rate = num_activities / time_span_days
    else:
        activity_rate = 0

    features.append([
        profile_id,
        degree,
        in_degree,
        out_degree,
        clustering_coefficient,
        #pagerank,
        #betweenness,
        good_neighbors,
        num_activities,
        activity_rate,
    ])

# --- Column names for the feature rows (same order as each row in `features`) ---

feature_names = [
    "profile_id",
    "degree",
    "in_degree",
    "out_degree",
    "clustering_coefficient",
    #"pagerank",
    #"betweenness",
    "good_neighbors",
    "num_activities",
    "activity_rate"
]


min: -3600000, max: 1714533722000


In [None]:
!pip install pandas
import pandas as pd
features_df = pd.DataFrame(features, columns=feature_names)

# --- 3. Target Variable (Testing Period) ---
# interacted_with_bad = 1 when, during the testing period, a profile is linked
# to a bad profile either
#   (a) directly, by a profile<->profile edge, or
#   (b) through an activity that a bad profile is also attached to.
# (b) is needed because the testing graph only holds timestamped edges. If
# profile-to-profile edges have no timestamp, (a) alone never matches and the
# target would be constant 0.
# NOTE(review): (b) treats sharing an activity as an interaction - confirm this
# matches the intended definition.

# Create a set for efficient lookup
bad_profile_ids_set = set(bad_profile_ids)

_node_types = testing_graph.vs["node_type"]
_original_ids = testing_graph.vs["original_id"]

_in_contact_with_bad = set()   # original IDs of profiles linked to a bad profile
_activity_participants = {}    # activity vertex index -> original IDs of attached profiles

# Single pass over the testing edges (instead of one full pass per profile).
for source_index, target_index in testing_graph.get_edgelist():
    source_is_profile = _node_types[source_index] == "profile"
    target_is_profile = _node_types[target_index] == "profile"

    if source_is_profile and target_is_profile:
        # (a) direct profile<->profile edge, in either direction
        source_id = _original_ids[source_index]
        target_id = _original_ids[target_index]
        if target_id in bad_profile_ids_set:
            _in_contact_with_bad.add(source_id)
        if source_id in bad_profile_ids_set:
            _in_contact_with_bad.add(target_id)
    elif source_is_profile or target_is_profile:
        # Exactly one endpoint is a profile; record it against the other endpoint.
        if source_is_profile:
            profile_index, other_index = source_index, target_index
        else:
            profile_index, other_index = target_index, source_index
        _activity_participants.setdefault(other_index, set()).add(_original_ids[profile_index])

# (b) every profile attached to an activity that a bad profile is attached to
for participants in _activity_participants.values():
    if participants & bad_profile_ids_set:
        _in_contact_with_bad.update(participants)

# profile_ids_list has the same order as the rows of features_df.
targets = [1 if profile_id in _in_contact_with_bad else 0 for profile_id in profile_ids_list]

# --- 4. Combine Features and Target ---
features_df["interacted_with_bad"] = targets # Add target variable

# --- 5. Analysis and Correlation (Example) ---
# Summary statistics for every column of the feature table.
print("\n--- Feature Statistics ---", features_df.describe(), sep="\n")

# Correlation of each column with the target, strongest positive first.
print("\n--- Correlation with Target Variable ---")
print(
    features_df.corr()
    .loc[:, "interacted_with_bad"]
    .sort_values(ascending=False)
)

# --- 6. Further Analysis and Visualization ---

import matplotlib.pyplot as plt #import
import seaborn as sns #import

# One box plot per feature (the ID column is skipped), split by whether the
# profile interacted with a bad profile.
for feature_name in (name for name in feature_names if name != 'profile_id'):
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=features_df, x="interacted_with_bad", y=feature_name)
    plt.title(f"{feature_name} vs. Interaction with Bad Profiles")
    plt.show()

# Scatter plot of good neighbours against activity rate, coloured by the target.
plt.figure()
sns.scatterplot(
    data=features_df,
    x="good_neighbors",
    y="activity_rate",
    hue="interacted_with_bad",
)
plt.show()
# --- (Optional) Model Training ---
# (You could use scikit-learn here to train a predictive model)
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, classification_report
#
# # Split data (if you want to build a model - not strictly necessary for initial exploration)
# X = features_df.drop(["profile_id", "interacted_with_bad"],axis = 1) # X = features
# y = features_df["interacted_with_bad"] # y = target
# if len(X) > 0: # Make Sure that you have enough data to train model.
  # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # # Train a model (example: Logistic Regression)
  # model = LogisticRegression()
  # model.fit(X_train, y_train)

  #  # Make predictions
  # y_pred = model.predict(X_test)

  #  # Evaluate
  # print("\n--- Model Evaluation ---")
  # print("Accuracy:", accuracy_score(y_test, y_pred))
  # print(classification_report(y_test, y_pred)) # Gives precision, recall, F1-score.


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
# --- 1. Ego Network ---
def plot_ego_network(graph, center_node_original_id, radius=2):
    """Plot the ego network of a profile node.

    Args:
        graph: The igraph Graph object. Its vertices must carry the
            "node_type", "original_id" and "name" attributes.
        center_node_original_id: The original ID of the center (profile) node.
        radius: The radius (distance from the center node) to include.

    Returns:
        The object returned by ``ig.plot`` for the ego subgraph, so the caller
        can display or save it, or None when no profile vertex with the given
        original ID exists. (The plot object used to be discarded.)
    """
    # Index of the first profile vertex with a matching original_id, else None.
    center_vertex_index = next(
        (
            v.index
            for v in graph.vs
            if v["node_type"] == "profile" and v["original_id"] == center_node_original_id
        ),
        None,
    )

    if center_vertex_index is None:
        print(f"Error: Could not find node with original ID {center_node_original_id}")
        return None

    # Vertices within 'radius' steps of the center (the center itself included).
    ego_network_vertices = graph.neighborhood(center_vertex_index, order=radius)

    # Induced subgraph on those vertices
    subgraph = graph.subgraph(ego_network_vertices)

    # Kamada-Kawai layout (often good for smaller graphs)
    layout = subgraph.layout("kk")

    # Basic visual styles: profiles in blue, everything else in salmon.
    visual_style = {
        "vertex_label": subgraph.vs["name"],
        "vertex_size": 20,
        "vertex_color": ["skyblue" if n["node_type"] == "profile" else "salmon" for n in subgraph.vs],
        "layout": layout,
        "bbox": (600, 600),
        "margin": 50,
    }

    return ig.plot(subgraph, **visual_style)

# Example usage: ego network of radius 2 around the profile with original ID 27059.
plot_ego_network(graph, 27059, radius=2)



# --- 2. Induced Subgraph from Community Detection ---
def plot_community_subgraph(graph, community_index):
    """Plot the subgraph corresponding to a specific community.

    Communities are detected with the multilevel (Louvain) algorithm every time
    this function is called.

    Args:
        graph: The igraph Graph object. Its vertices must carry the
            "node_type" and "name" attributes.
        community_index: The index of the community to plot (0-based).

    Returns:
        The object returned by ``ig.plot`` for the community subgraph, so the
        caller can display or save it, or None when ``community_index`` is out
        of range.
    """
    # community_multilevel() only supports undirected graphs, so detect the
    # communities on an undirected copy when the graph is directed. The copy has
    # the same vertex indices, so the membership maps straight back to `graph`.
    detection_graph = graph.as_undirected() if graph.is_directed() else graph
    communities = detection_graph.community_multilevel()  # Louvain
    # communities = graph.community_leiden()  # Leiden (another good algorithm)

    if community_index < 0 or community_index >= len(communities):
        print(f"Error: Invalid community index {community_index}.  Must be between 0 and {len(communities) - 1}")
        return None

    # Vertex indices belonging to the requested community
    community_vertices = communities[community_index]

    # Induced subgraph of the *original* graph (keeps edge directions)
    subgraph = graph.subgraph(community_vertices)

    # Layout and visual styles: profiles in blue, everything else in salmon.
    layout = subgraph.layout("kk")  # Or "fr"
    visual_style = {
        "vertex_label": subgraph.vs["name"],
        "vertex_size": 20,
        "vertex_color": ["skyblue" if n["node_type"] == "profile" else "salmon" for n in subgraph.vs],
        "layout": layout,
        "bbox": (600, 600),
        "margin": 20,
    }
    return ig.plot(subgraph, **visual_style)

# Example usage: plot the first detected community (index 0).
plot_community_subgraph(graph, 0)
# plot_community_subgraph(graph, 2)  # Plot community 2


def edge_sample(graph, num_edges):
    """Plot the subgraph formed by a random sample of the graph's edges.

    Args:
        graph: The igraph Graph object. Its vertices must carry the
            "node_type" and "name" attributes.
        num_edges: How many edges to sample. Values larger than the number of
            edges in the graph are capped at that number.

    Returns:
        The object returned by ``ig.plot`` for the sampled subgraph, so the
        caller can display or save it.
    """
    # Imported here because the notebook never imports `random` at the top.
    import random

    # Sample edge *ids* without replacement. Sampling ids rather than
    # (source, target) pairs keeps parallel edges distinct and gives
    # subgraph_edges() exactly the edge ids to keep.
    sample_size = min(num_edges, graph.ecount())
    sampled_edge_ids = random.sample(range(graph.ecount()), sample_size)
    subgraph = graph.subgraph_edges(sampled_edge_ids)

    # Layout and visual styles: profiles in blue, everything else in salmon.
    layout = subgraph.layout("kk")  # Or "fr"
    visual_style = {
        "vertex_label": subgraph.vs["name"],
        "vertex_size": 20,
        "vertex_color": ["skyblue" if n["node_type"] == "profile" else "salmon" for n in subgraph.vs],
        "layout": layout,
        "bbox": (600, 600),
        "margin": 20,
    }
    return ig.plot(subgraph, **visual_style)
# Example usage: plot a random sample of 300 edges.
print('\nPLOTTING EDGE SAMPLE\n')
edge_sample(graph, num_edges=300)


# # --- 3. Filtering by Degree/Centrality (Example with Degree) ---

def plot_high_degree_subgraph(graph, top_n=50):
    """Plot the subgraph induced by the 'top_n' highest-degree vertices.

    Args:
        graph: The igraph Graph object. Its vertices must carry the
            "node_type" and "name" attributes.
        top_n: How many of the highest-degree vertices to keep.

    Returns:
        The object returned by ``ig.plot`` for the subgraph, so the caller can
        display or save it.
    """
    # Degree of every vertex in the full graph
    degrees = graph.degree()

    # Indices of the 'top_n' vertices, highest degree first
    top_node_indices = sorted(range(len(degrees)), key=lambda i: degrees[i], reverse=True)[:top_n]

    # Induced subgraph: only edges *between* the selected vertices survive.
    subgraph = graph.subgraph(top_node_indices)

    # Vertex size is proportional to the degree inside the subgraph. Selected
    # hubs that are not linked to each other have degree 0 there, which used to
    # give them size 0 (invisible), so a small minimum size is enforced.
    min_vertex_size = 4
    vertex_sizes = [max(d * 2, min_vertex_size) for d in subgraph.degree()]

    # Layout and visual styles: profiles in blue, everything else in salmon.
    layout = subgraph.layout("kk")
    visual_style = {
        "vertex_label": subgraph.vs["name"],
        "vertex_size": vertex_sizes,
        "vertex_color": ["skyblue" if n["node_type"] == "profile" else "salmon" for n in subgraph.vs],
        "layout": layout,
        "bbox": (600, 600),
        "margin": 20,
    }
    return ig.plot(subgraph, **visual_style)


# Example usage: the 50 highest-degree vertices.
print('\nPLOTTING HIGH DEGREE SUBGRAPH\n')
plot_high_degree_subgraph(graph, 50)



# --- 4. Random Subsampling ---

def plot_random_subgraph(graph, num_nodes=100):
    """Plot a random induced subgraph with a specified number of nodes.

    Args:
        graph: The igraph Graph object. Its vertices must carry the
            "node_type" and "name" attributes.
        num_nodes: How many vertices to sample (without replacement).

    Returns:
        The object returned by ``ig.plot`` for the sampled subgraph, so the
        caller can display or save it, or None when ``num_nodes`` exceeds the
        number of vertices in the graph.
    """
    # Imported here because the notebook never imports `random` at the top.
    import random

    if num_nodes > graph.vcount():
        print("Error:  Cannot sample more nodes than exist in the graph.")
        return None

    # Randomly select vertex indices
    sampled_node_indices = random.sample(range(graph.vcount()), num_nodes)

    # Induced subgraph on the sampled vertices
    subgraph = graph.subgraph(sampled_node_indices)

    # Layout and visual styles: profiles in blue, everything else in salmon.
    layout = subgraph.layout("kk")  # Or "fr"
    visual_style = {
        "vertex_label": subgraph.vs["name"],
        "vertex_size": 20,
        "vertex_color": ["skyblue" if n["node_type"] == "profile" else "salmon" for n in subgraph.vs],
        "layout": layout,
        "bbox": (600, 600),
        "margin": 20,
    }
    return ig.plot(subgraph, **visual_style)

# Example usage: a random induced subgraph of 100 vertices.
print('\nPLOTTING RANDOM SUBGRAPH\n')
plot_random_subgraph(graph, 100)

KeyboardInterrupt: 