<a href="https://colab.research.google.com/github/unaisali7/community_detection/blob/main/community.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data
from sklearn.cluster import KMeans
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px

# Load data
# The CSV is read from the runtime's /content directory (path kept exactly as uploaded).
csv_path = '/content/filtered_users_multiple_videos (1).csv'
df = pd.read_csv(csv_path)

# Preprocess comment text
def clean_text(text):
    """Normalise a comment string for embedding.

    Removes every character that is not an ASCII letter, a digit or
    whitespace, then collapses each run of whitespace into a single space
    and trims both ends.

    Args:
        text: The raw comment; must be a ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Keep only [a-zA-Z0-9] and whitespace.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Squeeze whitespace runs down to one space, then trim the ends.
    squeezed = re.sub(r'\s+', ' ', alnum_only)
    return squeezed.strip()


# Keep only rows that actually carry a comment. The column is cast to str for
# the blank check so a non-string cell (e.g. a purely numeric comment) does not
# break the `.str` accessor or clean_text.
comments = df['Comment']
has_text = comments.notna() & (comments.astype(str).str.strip() != '')

# Take an explicit copy: assigning into a filtered slice would otherwise raise
# pandas' SettingWithCopyWarning and may not write through.
df = df[has_text].copy()
df['Comment'] = df['Comment'].astype(str).apply(clean_text)

# clean_text removes every non-alphanumeric character, so comments made only of
# emoji / punctuation / non-Latin script are empty now. Drop them: an empty
# string carries no content to embed and all such rows would look identical to
# one another.
df = df[df['Comment'] != '']

# Create embeddings from comments using SentenceTransformer.
# Bound to `sentence_model` (not `model`) because `model` is later used for the
# GraphSAGE network.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
comment_embeddings = sentence_model.encode(df['Comment'].tolist(), show_progress_bar=True)

# Build the graph on users: one node per distinct user, numbered in order of
# first appearance in df.
unique_users = df['User'].unique()
num_users = len(unique_users)
user_mapping = {user: idx for idx, user in enumerate(unique_users)}
user_indices = df['User'].map(user_mapping)

# comment_embeddings has one row per *comment*, while the graph has one node per
# *user*. Average each user's comment vectors so that row i of user_embeddings
# describes node i. groupby sorts its keys, so rows come out ordered 0..num_users-1.
user_embeddings = (
    pd.DataFrame(comment_embeddings)
    .groupby(user_indices.to_numpy())
    .mean()
    .to_numpy()
)
assert user_embeddings.shape[0] == num_users, "expected exactly one embedding row per user"

# Compute similarity between user embeddings (cosine similarity as an edge weight heuristic)
similarity_matrix = cosine_similarity(user_embeddings)

# Create edges based on similarity threshold
threshold = 0.5  # Set a similarity threshold
# Vectorised equivalent of the nested "for i < j" loop: nonzero() yields the
# matching (row, col) pairs in row-major order; keeping row < col drops the
# diagonal and stores each unordered pair once.
src_nodes, dst_nodes = (similarity_matrix >= threshold).nonzero()
upper_triangle = src_nodes < dst_nodes
src_nodes, dst_nodes = src_nodes[upper_triangle], dst_nodes[upper_triangle]
edge_indices = list(zip(src_nodes.tolist(), dst_nodes.tolist()))
edge_weights = similarity_matrix[src_nodes, dst_nodes].tolist()

# Define GraphSAGE using PyTorch Geometric
# Create edge_index for PyTorch Geometric.
# reshape(-1, 2) keeps the tensor well-formed (shape [2, 0]) when no pair clears
# the threshold.
pair_index = torch.tensor(edge_indices, dtype=torch.long).reshape(-1, 2).t()
# edge_index is directed (row 0 = source, row 1 = target). The similarity graph
# is undirected, so add the reversed copy of every edge; otherwise messages
# would only flow from the lower-numbered node to the higher-numbered one.
edge_index = torch.cat([pair_index, pair_index.flip(0)], dim=1).contiguous()

# Node features: one averaged embedding per user (row order == node index)
x = torch.tensor(user_embeddings, dtype=torch.float)

# Create graph data for GraphSAGE
data = Data(x=x, edge_index=edge_index)

# Define GraphSAGE model using SAGEConv
class GraphSAGEModel(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU in between.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Output size of the first SAGEConv layer.
        output_dim: Output size of the second SAGEConv layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        """Return the second layer's output for node features ``x`` and edges ``edge_index``."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        # No activation after the last layer: the raw output is returned.
        return self.conv2(hidden, edge_index)

# Initialize the model
input_dim = comment_embeddings.shape[1]
hidden_dim = 64
output_dim = 5  # Number of communities we want to cluster into

# The network is used with its randomly initialised weights, so seed torch
# first: without this every run produces different embeddings (and therefore
# different clusters) even though KMeans below uses a fixed random_state.
# NOTE(review): no training step runs between construction and the forward
# pass below - confirm that using untrained weights is intended.
torch.manual_seed(42)
model = GraphSAGEModel(input_dim, hidden_dim, output_dim)

# Forward pass through GraphSAGE (inference only: eval mode, no autograd graph)
model.eval()
with torch.no_grad():
    output_embeddings = model(x, edge_index)

# Cluster embeddings using KMeans; cluster_labels[i] is the community of node i
kmeans = KMeans(n_clusters=output_dim, random_state=42)
cluster_labels = kmeans.fit_predict(output_embeddings.detach().numpy())

# Create NetworkX graph for interactive visualization
G = nx.Graph()

# Node index -> user name. Both cluster_labels and edge_indices refer to nodes
# by index, so they must be translated through the *same* ordering: the distinct
# users in order of first appearance (the order user_mapping was built from).
node_users = df['User'].unique()

# Add nodes and cluster information to the graph
for idx, user in enumerate(node_users):
    G.add_node(user, cluster=int(cluster_labels[idx]))

# Add edges to the graph. Endpoints are node indices, not row positions in df,
# so look them up in node_users rather than with df['User'].iloc[...].
for src, dst in edge_indices:
    G.add_edge(node_users[src], node_users[dst])

# Prepare positions for nodes in the graph (using spring layout)
pos = nx.spring_layout(G, seed=42)

# Extract node positions and cluster info, all in G.nodes() order so that
# coordinates, colours and hover text stay aligned.
graph_nodes = list(G.nodes())
x_pos = [pos[node][0] for node in graph_nodes]
y_pos = [pos[node][1] for node in graph_nodes]
node_colors = [G.nodes[node]['cluster'] for node in graph_nodes]

# Create text labels for hover (display user info), taken from the graph itself
node_text = [f'User: {node}' for node in graph_nodes]

# Create Plotly figure
edge_x = []
edge_y = []
for (u, v) in G.edges():
    x0, y0 = pos[u]
    x1, y1 = pos[v]
    # The trailing None breaks the line after each edge; without it Plotly
    # joins the end of one edge to the start of the next with a bogus segment.
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Plot edges
edges_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines'
)

# Plot nodes
nodes_trace = go.Scatter(
    x=x_pos, y=y_pos,
    mode='markers',
    hoverinfo='text',
    text=node_text,
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        color=node_colors,
        size=10,
        colorbar=dict(
            thickness=15,
            # Nested title dict replaces the deprecated `titleside` attribute,
            # which newer Plotly releases reject.
            title=dict(text='Community', side='right'),
            xanchor='left'
        )
    )
)

# Create figure with nodes and edges
fig = go.Figure(data=[edges_trace, nodes_trace],
                layout=go.Layout(
                    # Nested title dict replaces the deprecated `titlefont_size`.
                    title=dict(
                        text="Interactive Community Graph with GraphSAGE",
                        font=dict(size=16)
                    ),
                    showlegend=False,
                    hovermode='closest',
                    xaxis=dict(showgrid=False, zeroline=False),
                    yaxis=dict(showgrid=False, zeroline=False),
                    plot_bgcolor='rgb(255, 255, 255)',
                    width=800, height=800
                ))

fig.show()


Batches:   0%|          | 0/81 [00:00<?, ?it/s]

In [None]:
# Render the figure built in the previous cell again (relies on `fig` already existing).
fig.show()
