In [1]:
# --- Cell 1: Install Required Packages ---
import sys
import subprocess
import importlib

def install_package(package):
    """Installs a package using pip, handling potential errors."""
    try:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"✅ Successfully installed {package}")
        return True
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to install {package}: {e}")
        return False
    except Exception as e:
        print(f"⚠️ Unexpected error installing {package}: {e}")
        return False

required_packages = [
    "python-arango",
    "networkx>=3.0",
    "pandas",
    "tqdm",
    "langchain-cohere",
    "langchain-community",
    "langchain",
    "matplotlib",
    "python-louvain",
    "requests",
    "httpx-sse",
    "openai",
    "gradio"  # Ensure Gradio is included
]

all_packages_installed = True
for package in required_packages:
    try:
        importlib.import_module(package.replace("-", "_"))
        print(f"✅ {package} is already installed")
    except ImportError:
        if not install_package(package):
            all_packages_installed = False

if not all_packages_installed:
    print("ERROR: Some packages failed to install.")

Installing python-arango...
✅ Successfully installed python-arango
Installing networkx>=3.0...
✅ Successfully installed networkx>=3.0
✅ pandas is already installed
✅ tqdm is already installed
✅ langchain-cohere is already installed
✅ langchain-community is already installed
✅ langchain is already installed
✅ matplotlib is already installed
Installing python-louvain...
✅ Successfully installed python-louvain
✅ requests is already installed
✅ httpx-sse is already installed
✅ openai is already installed


  from .autonotebook import tqdm as notebook_tqdm


✅ gradio is already installed


In [2]:
# --- Cell 2: Import Necessary Libraries and Configuration ---
import networkx as nx
import nx_arangodb as nxadb
from arango import ArangoClient
from langchain_cohere import ChatCohere
import pandas as pd
import gzip
import time
import os
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import json
import gradio as gr
import tempfile
import warnings

# API Keys (hardcoded for submission as per your requirement)
ARANGODB_HOST = "https://a40b6d186a3a.arangodb.cloud:8529"
ARANGODB_USER = "root"
ARANGODB_PASS = "2eM5Wd4NRTrcnHQt3yfM"
COHERE_API_KEY = "WcXnR3lxNWGwnoJmI2hq8CnCmPfAr8fRbFFacCsT"

# Suppress nx_arangodb warning
warnings.filterwarnings("ignore", category=UserWarning, module="nx_arangodb")

%matplotlib inline

--------------------------------------------------------------------------------

  CuPy may not function correctly because multiple CuPy packages are installed
  in your environment:

    cupy, cupy-cuda12x

  Follow these steps to resolve this issue:

    1. For all packages listed above, run the following command to remove all
       existing CuPy installations:

         $ pip uninstall <package_name>

      If you previously installed CuPy via conda, also run the following:

         $ conda uninstall cupy

    2. Install the appropriate CuPy package.
       Refer to the Installation Guide for detailed instructions.

         https://docs.cupy.dev/en/stable/install.html

--------------------------------------------------------------------------------

[12:27:33 +0530] [INFO]: NetworkX-cuGraph is available.


In [3]:
# --- Cell 3: ArangoDB Connection Function ---
from typing import Optional

def connect_to_arangodb(
    host: str = ARANGODB_HOST,
    username: str = ARANGODB_USER,
    password: str = ARANGODB_PASS,
    database: str = "amazon_copurchase",
) -> Optional[object]:
    """Connect to ArangoDB and return database handle."""
    try:
        client = ArangoClient(hosts=host)
        sys_db = client.db("_system", username=username, password=password)
        if not sys_db.has_database(database):
            sys_db.create_database(database)
            print(f"Created database: {database}")
        db = client.db(database, username=username, password=password)
        print(f"✅ Connected to ArangoDB database: {database}")
        return db
    except Exception as e:
        print(f"⚠️ Error connecting to ArangoDB: {e}")
        return None

db = connect_to_arangodb()

✅ Connected to ArangoDB database: amazon_copurchase


In [4]:
# --- Cell 4: cuGraph Availability Check ---
try:
    import cugraph
    print("✅ cugraph is available")
    use_cugraph = True
except ImportError:
    print("⚠️ cugraph is not available. Using NetworkX for all graph operations.")
    use_cugraph = False

⚠️ cugraph is not available. Using NetworkX for all graph operations.


In [5]:
# --- Cell 5: Dataset Download and Parsing ---
import requests

def download_file(url, filename):
    """Download a file with progress bar"""
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        total_size = int(r.headers.get('content-length', 0))
        with open(filename, 'wb') as f:
            with tqdm(total=total_size, unit='B', unit_scale=True, unit_divisor=1024, desc=filename) as pbar:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))
    print(f"✅ Downloaded {filename}")

def parse_amazon_copurchase(gz_file, sample_size=100000):
    """Parse Amazon co-purchasing network from gzipped file"""
    print(f"Parsing co-purchase network from {gz_file}...")
    edges = []
    try:
        with gzip.open(gz_file, 'rt', encoding='utf-8') as f:
            for i, line in enumerate(tqdm(f, desc="Reading edges", unit=" lines")):
                if not line.startswith('#'):
                    parts = line.strip().split()
                    if len(parts) >= 2:
                        source, target = parts[0], parts[1]
                        edges.append((source, target))
                    if i >= sample_size:
                        break
    except Exception as e:
        print(f"Error parsing {gz_file}: {e}")
        return pd.DataFrame()
    
    print(f"✅ Parsed {len(edges)} co-purchase edges")
    if not edges:
        return pd.DataFrame()
    
    df = pd.DataFrame(edges, columns=['source', 'target'])
    df['source'] = pd.to_numeric(df['source'], errors='coerce').astype(int)
    df['target'] = pd.to_numeric(df['target'], errors='coerce').astype(int)
    df = df.dropna()
    csv_filename = gz_file.replace('.gz', '.csv')
    df.to_csv(csv_filename, index=False)
    print(f"✅ Saved to {csv_filename}")
    return df

# Amazon datasets
amazon_datasets = {
    "amazon0302": "http://snap.stanford.edu/data/amazon0302.txt.gz",
    "amazon0312": "http://snap.stanford.edu/data/amazon0312.txt.gz",
    "amazon0505": "http://snap.stanford.edu/data/amazon0505.txt.gz",
    "amazon0601": "http://snap.stanford.edu/data/amazon0601.txt.gz",
}

In [6]:
# --- Cell 6: Load and Prepare Graphs ---
amazon_graphs = {}

def load_graphs():
    if not os.path.exists("amazon_data"):
        os.makedirs("amazon_data")
    
    for dataset_name, url in amazon_datasets.items():
        filename = os.path.join("amazon_data", url.split('/')[-1])
        csv_filename = filename.replace('.gz', '.csv')
        
        if os.path.exists(csv_filename):
            print(f"Loading from {csv_filename}")
            df = pd.read_csv(csv_filename)
        elif os.path.exists(filename):
            df = parse_amazon_copurchase(filename)
        else:
            download_file(url, filename)
            df = parse_amazon_copurchase(filename)
        
        if not df.empty:
            G = nx.from_pandas_edgelist(df, "source", "target", create_using=nx.DiGraph())
            amazon_graphs[dataset_name] = G
            print(f"✅ Created {dataset_name} graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
        else:
            print(f"⚠️ Failed to load data for {dataset_name}")

load_graphs()

Loading from amazon_data/amazon0302.txt.csv
✅ Created amazon0302 graph with 30232 nodes and 99997 edges
Loading from amazon_data/amazon0312.txt.csv
✅ Created amazon0312 graph with 28648 nodes and 99997 edges
Loading from amazon_data/amazon0505.txt.csv
✅ Created amazon0505 graph with 22045 nodes and 99997 edges
Loading from amazon_data/amazon0601.txt.csv
✅ Created amazon0601 graph with 26520 nodes and 99997 edges


In [7]:
# --- Cell 7: Gradio Interface ---
def analyze_graph(G):
    """Analyze a graph and return key metrics"""
    analysis = {}
    analysis['num_nodes'] = G.number_of_nodes()
    analysis['num_edges'] = G.number_of_edges()
    analysis['avg_degree'] = sum(dict(G.degree()).values()) / G.number_of_nodes()
    analysis['density'] = nx.density(G)
    try:
        largest_cc = max(nx.weakly_connected_components(G), key=len)
        analysis['largest_cc_percentage'] = (len(largest_cc) / G.number_of_nodes()) * 100
    except:
        analysis['largest_cc_percentage'] = "Not computable"
    return analysis

def create_matplotlib_visualization(graph_name):
    if graph_name not in amazon_graphs:
        return None
    G = amazon_graphs[graph_name]
    plt.figure(figsize=(10, 8))
    pos = nx.spring_layout(G, seed=42) if G.number_of_nodes() < 50 else nx.kamada_kawai_layout(G)
    nx.draw_networkx(G, pos=pos, with_labels=True, node_color='skyblue', node_size=300, 
                    edge_color='gray', arrows=True)
    plt.title(f"Graph: {graph_name}")
    plt.axis('off')
    file_path = os.path.join(tempfile.gettempdir(), f"{graph_name}_plot.png")
    plt.savefig(file_path, format='png', bbox_inches='tight')
    plt.close()
    return file_path

def query_graph(query):
    query = query.lower()
    graph_name = next((name for name in amazon_graphs.keys() if name in query), None)
    if not graph_name:
        return "Please specify a graph (amazon0302, amazon0312, amazon0505, or amazon0601)"
    
    G = amazon_graphs[graph_name]
    
    if "neighbors of" in query or "connected to" in query:
        try:
            node_id = int(re.search(r'node (\d+)', query).group(1))
            neighbors = list(G.neighbors(node_id))
            return f"Neighbors of node {node_id} in {graph_name}: {neighbors}"
        except:
            return "Please specify a valid node number"
    
    elif "shortest path" in query:
        try:
            numbers = [int(x) for x in re.findall(r'\d+', query)]
            source, target = numbers[0], numbers[1]
            path = nx.shortest_path(G, source, target)
            return f"Shortest path from {source} to {target} in {graph_name}: {path}"
        except:
            return "Please specify valid source and target nodes"
    
    return "Query not understood"

def create_gradio_interface():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Amazon Co-Purchase Network Analysis")
        
        # Query Section
        gr.Markdown("### Query the Graph")
        with gr.Row():
            query_input = gr.Textbox(label="Enter your query", 
                                   placeholder="e.g., Find neighbors of node 0 in amazon0302")
            query_button = gr.Button("Submit Query")
        query_output = gr.Textbox(label="Query Result")
        
        # Visualization Section
        gr.Markdown("### Visualize the Graph")
        with gr.Row():
            graph_dropdown = gr.Dropdown(choices=list(amazon_graphs.keys()), 
                                       label="Select a graph", 
                                       value=list(amazon_graphs.keys())[0])
            visualize_button = gr.Button("Visualize Graph")
        visualization_output = gr.Image(label="Graph Visualization", type="filepath")
        
        # Analysis Section
        gr.Markdown("### Analyze the Graph")
        with gr.Row():
            analyze_dropdown = gr.Dropdown(choices=list(amazon_graphs.keys()), 
                                         label="Select a graph", 
                                         value=list(amazon_graphs.keys())[0])
            analyze_button = gr.Button("Analyze Graph")
        analysis_output = gr.Textbox(label="Graph Analysis", max_lines=10)
        
        # Event Handlers
        query_button.click(query_graph, inputs=query_input, outputs=query_output)
        visualize_button.click(create_matplotlib_visualization, 
                             inputs=graph_dropdown, 
                             outputs=visualization_output)
        analyze_button.click(lambda x: str(analyze_graph(amazon_graphs[x])), 
                           inputs=analyze_dropdown, 
                           outputs=analysis_output)
    
    return demo

# Launch the interface
if amazon_graphs:
    demo = create_gradio_interface()
    demo.launch(share=True)
else:
    print("No graphs loaded. Please check data loading steps.")

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://94df39c697add43331.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [8]:
# --- Cell 7: Graph Analysis Functions ---
def analyze_graph(G):
    """Analyze a graph and return key metrics"""
    analysis = {}
    analysis['num_nodes'] = G.number_of_nodes()
    analysis['num_edges'] = G.number_of_edges()
    analysis['avg_degree'] = sum(dict(G.degree()).values()) / G.number_of_nodes()
    analysis['density'] = nx.density(G)
    
    try:
        largest_cc = max(nx.weakly_connected_components(G), key=len)
        analysis['largest_cc_percentage'] = (len(largest_cc) / G.number_of_nodes()) * 100
        analysis['largest_cc_size'] = len(largest_cc)
    except Exception as e:
        analysis['largest_cc_percentage'] = "Not computable"
        analysis['largest_cc_size'] = "Not computable"
    
    try:
        degree_dict = dict(G.degree())
        analysis['top_nodes_by_degree'] = sorted(degree_dict.items(), key=lambda x: x[1], reverse=True)[:5]
    except Exception as e:
        analysis['top_nodes_by_degree'] = "Not computable"
    
    return analysis

def detect_communities(G, max_nodes=5000):
    """Detect communities in the graph using Louvain, with fallback"""
    if G.number_of_nodes() > max_nodes:
        print(f"Graph too large ({G.number_of_nodes()} nodes), sampling {max_nodes} nodes")
        seed_node = max(G.degree, key=lambda x: x[1])[0]
        sample_nodes = set(nx.single_source_shortest_path_length(G, seed_node, cutoff=2).keys())
        subgraph = G.subgraph(sample_nodes)
    else:
        subgraph = G
    
    undirected_G = subgraph.to_undirected()
    
    try:
        import community as community_louvain
        partition = community_louvain.best_partition(undirected_G)
        communities = {}
        for node, comm_id in partition.items():
            communities.setdefault(comm_id, []).append(node)
        sorted_communities = sorted(communities.items(), key=lambda x: len(x[1]), reverse=True)
        return {
            "algorithm": "louvain",
            "num_communities": len(communities),
            "community_sizes": [len(comm) for _, comm in sorted_communities[:5]],
            "top_communities": sorted_communities[:3]
        }
    except ImportError:
        components = list(nx.connected_components(undirected_G))
        sorted_components = sorted(components, key=len, reverse=True)
        return {
            "algorithm": "connected_components",
            "num_communities": len(components),
            "community_sizes": [len(comp) for comp in sorted_components[:5]],
            "top_communities": [(i, list(comp)[:5]) for i, comp in enumerate(sorted_components[:3])]
        }

In [9]:
# --- Cell 8: Visualization Functions ---
def create_matplotlib_visualization(graph_name):
    """Create a matplotlib visualization of the graph"""
    if graph_name not in amazon_graphs:
        return None
    G = amazon_graphs[graph_name]
    
    # Sample for large graphs
    if G.number_of_nodes() > 100:
        seed_node = max(G.degree, key=lambda x: x[1])[0]
        sample_nodes = set(nx.single_source_shortest_path_length(G, seed_node, cutoff=2).keys())
        G = G.subgraph(sample_nodes)
    
    plt.figure(figsize=(10, 8))
    pos = nx.spring_layout(G, seed=42) if G.number_of_nodes() < 50 else nx.kamada_kawai_layout(G)
    nx.draw_networkx(G, pos=pos, with_labels=True, node_color='skyblue', node_size=300,
                    edge_color='gray', arrows=True, font_size=8)
    plt.title(f"Graph: {graph_name} (Sampled if large)")
    plt.axis('off')
    file_path = os.path.join(tempfile.gettempdir(), f"{graph_name}_plot.png")
    plt.savefig(file_path, format='png', bbox_inches='tight')
    plt.close()
    return file_path

In [10]:
# --- Cell 9: Query Function ---
def query_graph(query):
    """Handle graph-related queries"""
    query = query.lower()
    graph_name = next((name for name in amazon_graphs.keys() if name in query), None)
    if not graph_name:
        return "Please specify a graph (amazon0302, amazon0312, amazon0505, or amazon0601)"
    
    G = amazon_graphs[graph_name]
    
    if "neighbors of" in query or "connected to" in query:
        try:
            node_id = int(re.search(r'node (\d+)', query).group(1))
            if node_id not in G.nodes():
                return f"Node {node_id} not found in {graph_name}"
            neighbors = list(G.neighbors(node_id))
            return f"Neighbors of node {node_id} in {graph_name}: {neighbors}"
        except:
            return "Please specify a valid node number"
    
    elif "shortest path" in query:
        try:
            numbers = [int(x) for x in re.findall(r'\d+', query)]
            source, target = numbers[0], numbers[1]
            if source not in G.nodes() or target not in G.nodes():
                return f"Nodes {source} and/or {target} not found in {graph_name}"
            path = nx.shortest_path(G, source, target)
            return f"Shortest path from {source} to {target} in {graph_name}: {path}"
        except nx.NetworkXNoPath:
            return f"No path exists between nodes {source} and {target} in {graph_name}"
        except:
            return "Please specify valid source and target nodes"
    
    elif "degree of" in query:
        try:
            node_id = int(re.search(r'node (\d+)', query).group(1))
            degree = G.degree(node_id)
            return f"Degree of node {node_id} in {graph_name}: {degree}"
        except:
            return "Please specify a valid node number"
    
    return "Query not understood. Try 'neighbors of node X', 'shortest path from X to Y', or 'degree of node X'"

In [11]:
# --- Cell 10: Gradio Interface ---
def display_analysis(graph_name):
    """Display analysis results"""
    if graph_name not in amazon_graphs:
        return "Graph not found"
    analysis = analyze_graph(amazon_graphs[graph_name])
    community = detect_communities(amazon_graphs[graph_name])
    
    return (f"Graph: {graph_name}\n"
            f"Nodes: {analysis['num_nodes']}\n"
            f"Edges: {analysis['num_edges']}\n"
            f"Average Degree: {analysis['avg_degree']:.2f}\n"
            f"Density: {analysis['density']:.4f}\n"
            f"Largest Component: {analysis['largest_cc_percentage']}%\n"
            f"Top 5 Nodes by Degree: {analysis['top_nodes_by_degree']}\n"
            f"Number of Communities: {community['num_communities']}\n"
            f"Top Community Sizes: {community['community_sizes']}")

def create_gradio_interface():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Amazon Co-Purchase Network Analysis")
        
        # Query Section
        gr.Markdown("### Query the Graph")
        with gr.Row():
            query_input = gr.Textbox(label="Enter your query",
                                   placeholder="e.g., Find neighbors of node 0 in amazon0302")
            query_button = gr.Button("Submit Query")
        query_output = gr.Textbox(label="Query Result")
        
        # Visualization Section
        gr.Markdown("### Visualize the Graph")
        with gr.Row():
            graph_dropdown = gr.Dropdown(choices=list(amazon_graphs.keys()),
                                       label="Select a graph",
                                       value=list(amazon_graphs.keys())[0])
            visualize_button = gr.Button("Visualize Graph")
        visualization_output = gr.Image(label="Graph Visualization", type="filepath")
        
        # Analysis Section
        gr.Markdown("### Analyze the Graph")
        with gr.Row():
            analyze_dropdown = gr.Dropdown(choices=list(amazon_graphs.keys()),
                                         label="Select a graph",
                                         value=list(amazon_graphs.keys())[0])
            analyze_button = gr.Button("Analyze Graph")
        analysis_output = gr.Textbox(label="Graph Analysis", max_lines=15)
        
        # Examples
        gr.Markdown("### Example Queries")
        gr.Examples(
            examples=[
                "Find neighbors of node 0 in amazon0302",
                "Shortest path from node 0 to node 5 in amazon0312",
                "Degree of node 10 in amazon0505"
            ],
            inputs=query_input
        )
        
        # Event Handlers
        query_button.click(query_graph, inputs=query_input, outputs=query_output)
        visualize_button.click(create_matplotlib_visualization,
                             inputs=graph_dropdown,
                             outputs=visualization_output)
        analyze_button.click(display_analysis,
                           inputs=analyze_dropdown,
                           outputs=analysis_output)
    
    return demo

# Launch only if graphs are loaded
if amazon_graphs:
    demo = create_gradio_interface()
    demo.launch(share=True)
else:
    print("No graphs loaded. Please check data loading steps.")

* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://0dd8a6e9176e63d0f2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [12]:
# --- Cell 11: LangChain Integration (Optional) ---
from langchain_cohere import ChatCohere
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

cohere_llm = ChatCohere(model="command", cohere_api_key=COHERE_API_KEY, temperature=0)

def llm_query(query):
    """Use LLM to answer complex graph queries"""
    graph_name = next((name for name in amazon_graphs.keys() if name in query.lower()), None)
    if not graph_name:
        return "Please specify a graph in your query"
    
    analysis = analyze_graph(amazon_graphs[graph_name])
    prompt = PromptTemplate(
        template="Given a graph with {num_nodes} nodes, {num_edges} edges, and average degree {avg_degree}, "
                 "answer this question: {query}",
        input_variables=["num_nodes", "num_edges", "avg_degree", "query"]
    )
    chain = LLMChain(llm=cohere_llm, prompt=prompt)
    return chain.run({
        "num_nodes": analysis['num_nodes'],
        "num_edges": analysis['num_edges'],
        "avg_degree": analysis['avg_degree'],
        "query": query
    })

# Add to Gradio interface by modifying Cell 10's query_button.click to:
# query_button.click(lambda q: llm_query(q) if "what" in q.lower() or "how" in q.lower() else query_graph(q),
#                    inputs=query_input, outputs=query_output)

In [13]:
# --- Cell 11: ArangoDB Persistence ---
from typing import Dict, Optional

def persist_networkx_graph_to_arangodb(db, graph_name: str, G: nx.Graph, prefix: str = "AmazonCoPurchase"):
    """Persist a NetworkX graph to ArangoDB"""
    if not db:
        print("⚠️ No database connection provided.")
        return None
    
    arangodb_graph_name = f"{prefix}_{graph_name}"
    nodes_collection_name = f"products_{graph_name}"
    edges_collection_name = f"copurchases_{graph_name}"
    
    try:
        if db.has_graph(arangodb_graph_name):
            graph = db.graph(arangodb_graph_name)
            nodes = graph.vertex_collection(nodes_collection_name)
            edges = graph.edge_collection(edges_collection_name)
        else:
            graph = db.create_graph(arangodb_graph_name)
            nodes = graph.create_vertex_collection(nodes_collection_name)
            edges = graph.create_edge_definition(
                edge_collection=edges_collection_name,
                from_vertex_collections=[nodes_collection_name],
                to_vertex_collections=[nodes_collection_name],
            )
        
        # Batch process nodes
        node_batch = [{'_key': str(node)} for node in G.nodes()]
        nodes.import_bulk(node_batch, on_duplicate="replace")
        
        # Batch process edges
        edge_batch = [
            {"_from": f"{nodes_collection_name}/{source}", 
             "_to": f"{nodes_collection_name}/{target}"}
            for source, target in G.edges()
        ]
        edges.import_bulk(edge_batch, on_duplicate="replace")
        
        print(f"✅ Persisted {graph_name} to ArangoDB")
        return nxadb.Graph(name=arangodb_graph_name, db=db)
    except Exception as e:
        print(f"Error persisting graph: {e}")
        return None

def persist_all_graphs():
    """Persist all graphs to ArangoDB"""
    if not db:
        return {}
    
    adb_graphs = {}
    for graph_name, G in amazon_graphs.items():
        adb_graph = persist_networkx_graph_to_arangodb(db, graph_name, G)
        if adb_graph:
            adb_graphs[graph_name] = adb_graph
    return adb_graphs

# Persist graphs
adb_graphs = persist_all_graphs()

✅ Persisted amazon0302 to ArangoDB


[12:28:07 +0530] [INFO]: Graph 'AmazonCoPurchase_amazon0302' exists.
[12:28:08 +0530] [INFO]: Default node type set to 'products_amazon0302'


✅ Persisted amazon0312 to ArangoDB


[12:28:25 +0530] [INFO]: Graph 'AmazonCoPurchase_amazon0312' exists.
[12:28:26 +0530] [INFO]: Default node type set to 'products_amazon0312'


✅ Persisted amazon0505 to ArangoDB


[12:28:49 +0530] [INFO]: Graph 'AmazonCoPurchase_amazon0505' exists.
[12:28:50 +0530] [INFO]: Default node type set to 'products_amazon0505'


✅ Persisted amazon0601 to ArangoDB


[12:29:07 +0530] [INFO]: Graph 'AmazonCoPurchase_amazon0601' exists.
[12:29:08 +0530] [INFO]: Default node type set to 'products_amazon0601'


In [14]:
# --- Cell 12: LangChain Setup and Agent ---
from langchain_cohere import ChatCohere
from langchain_community.graphs import ArangoGraph
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.tools import tool
from langchain.agents import create_react_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage  # Import message types

cohere_llm = ChatCohere(model="command", cohere_api_key=COHERE_API_KEY, temperature=0)
arango_graph = ArangoGraph(db=db) if db else None

@tool
def arango_query(query: str):
    """Execute AQL queries via LangChain"""
    if not arango_graph:
        return "ArangoDB not available"
    chain = ArangoGraphQAChain.from_llm(llm=cohere_llm, graph=arango_graph, verbose=True)
    return str(chain.invoke({"query": query})["result"])

@tool
def networkx_query(query: str):
    """Execute NetworkX queries"""
    graph_name = next((name for name in amazon_graphs.keys() if name in query.lower()), None)
    if not graph_name:
        return "Please specify a graph name"
    
    G = amazon_graphs[graph_name]
    prompt = PromptTemplate(
        template="Using a NetworkX DiGraph with {num_nodes} nodes and {num_edges} edges, "
                 "write Python code to answer: {query}. Set FINAL_RESULT to the answer.",
        input_variables=["num_nodes", "num_edges", "query"]
    )
    chain = LLMChain(llm=cohere_llm, prompt=prompt)
    code = chain.invoke({"num_nodes": G.number_of_nodes(), "num_edges": G.number_of_edges(), "query": query})["text"]
    
    try:
        local_vars = {}
        exec(code, {"nx": nx, "nx_graph": G}, local_vars)
        return str(local_vars.get("FINAL_RESULT", "No result computed"))
    except Exception as e:
        return f"Error executing code: {e}"

tools = [arango_query, networkx_query]

# Corrected prompt with explicit instructions and handling of agent_scratchpad
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a graph analysis assistant. Use the following tools to answer queries about Amazon co-purchase networks:

Available tools: {tools}
Tool names: {tool_names}

Answer in a step-by-step format showing your reasoning. Use the tools provided to compute answers. For example:

Query: "What are the neighbors of node 0 in amazon0302?"
Thought: I need to find the neighbors of a specific node in a NetworkX graph. The networkx_query tool is appropriate.
Action: networkx_query
Action Input: "What are the neighbors of node 0 in amazon0302?"

The agent_scratchpad will contain the history of your reasoning and actions as a list of messages. Use it to track your progress.
"""),
    MessagesPlaceholder(variable_name="agent_scratchpad"),  # This expects a list of messages
    ("human", "{input}"),  # Changed "user" to "human" for consistency with LangChain
])

agent = create_react_agent(cohere_llm, tools, prompt=prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, handle_parsing_errors=True)

def enhanced_query(query):
    """Enhanced query function using LangChain agent"""
    # Initialize agent_scratchpad as an empty list of messages
    try:
        result = agent_executor.invoke({
            "input": query,
            "agent_scratchpad": []  # Start with an empty list of messages
        })
        return result["output"]
    except Exception as e:
        return f"Error processing query: {e}"

In [15]:
# --- Cell 13: Main Analysis and Storage ---
graph_analyses = {}
community_analyses = {}

def run_analysis():
    """Run analysis on all graphs"""
    for graph_name, G in amazon_graphs.items():
        print(f"Analyzing {graph_name}...")
        graph_analyses[graph_name] = analyze_graph(G)
        community_analyses[graph_name] = detect_communities(G)

if amazon_graphs:
    run_analysis()
    print("✅ Analysis complete")
else:
    print("⚠️ No graphs to analyze")


Analyzing amazon0302...
Graph too large (30232 nodes), sampling 5000 nodes
Analyzing amazon0312...
Graph too large (28648 nodes), sampling 5000 nodes
Analyzing amazon0505...
Graph too large (22045 nodes), sampling 5000 nodes
Analyzing amazon0601...
Graph too large (26520 nodes), sampling 5000 nodes
✅ Analysis complete


In [16]:
# --- Cell 14: Enhanced Gradio Interface ---
def create_enhanced_gradio_interface():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Amazon Co-Purchase Network Analysis")
        
        # Query Section
        gr.Markdown("### Query the Graph")
        with gr.Row():
            query_input = gr.Textbox(label="Enter your query",
                                   placeholder="e.g., What are the neighbors of node 0 in amazon0302?")
            query_button = gr.Button("Submit Query")
        query_output = gr.Textbox(label="Query Result")
        
        # Visualization Section
        gr.Markdown("### Visualize the Graph")
        with gr.Row():
            graph_dropdown = gr.Dropdown(choices=list(amazon_graphs.keys()),
                                       label="Select a graph",
                                       value=list(amazon_graphs.keys())[0])
            visualize_button = gr.Button("Visualize Graph")
        visualization_output = gr.Image(label="Graph Visualization", type="filepath")
        
        # Analysis Section
        gr.Markdown("### Analyze the Graph")
        with gr.Row():
            analyze_dropdown = gr.Dropdown(choices=list(amazon_graphs.keys()),
                                         label="Select a graph",
                                         value=list(amazon_graphs.keys())[0])
            analyze_button = gr.Button("Analyze Graph")
        analysis_output = gr.Textbox(label="Graph Analysis", max_lines=15)
        
        # Examples
        gr.Markdown("### Example Queries")
        gr.Examples(
            examples=[
                "What are the neighbors of node 0 in amazon0302?",
                "Find the shortest path from node 0 to node 5 in amazon0312",
                "What is the degree distribution in amazon0505?",
                "How many communities are in amazon0601?"
            ],
            inputs=query_input
        )
        
        # Event Handlers
        query_button.click(enhanced_query, inputs=query_input, outputs=query_output)
        visualize_button.click(create_matplotlib_visualization,
                             inputs=graph_dropdown,
                             outputs=visualization_output)
        analyze_button.click(display_analysis,
                           inputs=analyze_dropdown,
                           outputs=analysis_output)
    
    return demo

# Launch enhanced interface
if amazon_graphs:
    demo = create_enhanced_gradio_interface()
    demo.launch(share=True)
else:
    print("No graphs loaded. Please check data loading steps.")

* Running on local URL:  http://127.0.0.1:7862
* Running on public URL: https://1b4cffebc047a8e320.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [17]:
# --- Cell 15: Visualization of Analysis Results ---
def plot_degree_distribution(graph_name):
    """Plot degree distribution for a graph"""
    if graph_name not in amazon_graphs:
        return None
    
    G = amazon_graphs[graph_name]
    degrees = [d for n, d in G.degree()]
    plt.figure(figsize=(10, 6))
    plt.hist(degrees, bins=50, log=True)
    plt.title(f"Degree Distribution - {graph_name}")
    plt.xlabel("Degree")
    plt.ylabel("Frequency (log scale)")
    file_path = os.path.join(tempfile.gettempdir(), f"{graph_name}_degree_dist.png")
    plt.savefig(file_path)
    plt.close()
    return file_path

# Example usage
if amazon_graphs:
    for graph_name in amazon_graphs.keys():
        dist_plot = plot_degree_distribution(graph_name)
        if dist_plot:
            print(f"Degree distribution plot saved for {graph_name}: {dist_plot}")

Degree distribution plot saved for amazon0302: /tmp/amazon0302_degree_dist.png
Degree distribution plot saved for amazon0312: /tmp/amazon0312_degree_dist.png
Degree distribution plot saved for amazon0505: /tmp/amazon0505_degree_dist.png
Degree distribution plot saved for amazon0601: /tmp/amazon0601_degree_dist.png


In [18]:
# --- Cell 16: Metadata Integration ---
def parse_amazon_metadata(gz_file, sample_size=10000):
    """Parse Amazon metadata from gzipped JSON file"""
    print(f"Parsing metadata from {gz_file}...")
    products = []
    try:
        with gzip.open(gz_file, 'rt', encoding='utf-8') as f:
            for i, line in enumerate(tqdm(f, desc="Reading metadata", unit=" lines")):
                if line.strip():
                    try:
                        product = json.loads(line)
                        products.append(product)
                    except json.JSONDecodeError:
                        continue
                if i >= sample_size:
                    break
    except Exception as e:
        print(f"Error parsing metadata: {e}")
        return pd.DataFrame()
    
    df = pd.DataFrame(products)
    if 'asin' in df.columns:
        df['asin'] = df['asin'].astype(str)
        csv_filename = gz_file.replace('.gz', '.csv')
        df.to_csv(csv_filename, index=False)
        print(f"✅ Saved metadata to {csv_filename}")
    return df

def add_metadata_to_graphs():
    """Add metadata to existing graphs"""
    metadata_url = "http://snap.stanford.edu/data/amazon/productGraph/metadata.json.gz"
    filename = os.path.join("amazon_data", "metadata.json.gz")
    csv_filename = filename.replace('.gz', '.csv')
    
    if os.path.exists(csv_filename):
        metadata_df = pd.read_csv(csv_filename)
    elif os.path.exists(filename):
        metadata_df = parse_amazon_metadata(filename)
    else:
        download_file(metadata_url, filename)
        metadata_df = parse_amazon_metadata(filename)
    
    if not metadata_df.empty and 'asin' in metadata_df.columns:
        for graph_name, G in amazon_graphs.items():
            print(f"Adding metadata to {graph_name}...")
            for node in tqdm(G.nodes(), desc="Updating nodes"):
                node_str = str(node)
                metadata = metadata_df[metadata_df['asin'] == node_str]
                if not metadata.empty:
                    for col in metadata.columns:
                        if col != 'asin':
                            value = metadata[col].iloc[0]
                            if pd.notna(value):
                                G.nodes[node][col] = value
            # Re-persist with metadata
            if db:
                persist_networkx_graph_to_arangodb(db, graph_name, G)

if amazon_graphs:
    add_metadata_to_graphs()

Parsing metadata from amazon_data/metadata.json.gz...


Reading metadata: 9430088 lines [01:13, 128991.42 lines/s]


In [19]:
# --- Cell 16: Metadata Integration ---
def parse_amazon_metadata(gz_file, sample_size=10000):
    """Parse Amazon metadata from gzipped JSON file"""
    print(f"Parsing metadata from {gz_file}...")
    products = []
    try:
        with gzip.open(gz_file, 'rt', encoding='utf-8') as f:
            for i, line in enumerate(tqdm(f, desc="Reading metadata", unit=" lines")):
                if line.strip():
                    try:
                        product = json.loads(line)
                        products.append(product)
                    except json.JSONDecodeError:
                        continue
                if i >= sample_size:
                    break
    except Exception as e:
        print(f"Error parsing metadata: {e}")
        return pd.DataFrame()
    
    df = pd.DataFrame(products)
    if 'asin' in df.columns:
        df['asin'] = df['asin'].astype(str)
        csv_filename = gz_file.replace('.gz', '.csv')
        df.to_csv(csv_filename, index=False)
        print(f"✅ Saved metadata to {csv_filename}")
    return df

def add_metadata_to_graphs():
    """Add metadata to existing graphs"""
    metadata_url = "http://snap.stanford.edu/data/amazon/productGraph/metadata.json.gz"
    filename = os.path.join("amazon_data", "metadata.json.gz")
    csv_filename = filename.replace('.gz', '.csv')
    
    if os.path.exists(csv_filename):
        metadata_df = pd.read_csv(csv_filename)
    elif os.path.exists(filename):
        metadata_df = parse_amazon_metadata(filename)
    else:
        download_file(metadata_url, filename)
        metadata_df = parse_amazon_metadata(filename)
    
    if not metadata_df.empty and 'asin' in metadata_df.columns:
        for graph_name, G in amazon_graphs.items():
            print(f"Adding metadata to {graph_name}...")
            for node in tqdm(G.nodes(), desc="Updating nodes"):
                node_str = str(node)
                metadata = metadata_df[metadata_df['asin'] == node_str]
                if not metadata.empty:
                    for col in metadata.columns:
                        if col != 'asin':
                            value = metadata[col].iloc[0]
                            if pd.notna(value):
                                G.nodes[node][col] = value
            # Re-persist with metadata
            if db:
                persist_networkx_graph_to_arangodb(db, graph_name, G)

if amazon_graphs:
    add_metadata_to_graphs()

Parsing metadata from amazon_data/metadata.json.gz...


Reading metadata: 9430088 lines [01:12, 129240.77 lines/s]


In [26]:
# --- Cell 17: Community Visualization ---
def plot_community_distribution(graph_name):
    """Visualize community size distribution"""
    if graph_name not in community_analyses:
        return None
    
    comm_data = community_analyses[graph_name]
    sizes = comm_data['community_sizes']
    
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(sizes)), sizes)
    plt.title(f"Community Size Distribution - {graph_name}")
    plt.xlabel("Community Index (Top 5)")
    plt.ylabel("Size")
    file_path = os.path.join(tempfile.gettempdir(), f"{graph_name}_comm_dist.png")
    plt.savefig(file_path)
    plt.close()
    return file_path

# Example usage
if community_analyses:
    for graph_name in community_analyses.keys():
        comm_plot = plot_community_distribution(graph_name)
        if comm_plot:
            print(f"Community distribution plot saved for {graph_name}: {comm_plot}")

Community distribution plot saved for amazon0302: /tmp/amazon0302_comm_dist.png
Community distribution plot saved for amazon0312: /tmp/amazon0312_comm_dist.png
Community distribution plot saved for amazon0505: /tmp/amazon0505_comm_dist.png
Community distribution plot saved for amazon0601: /tmp/amazon0601_comm_dist.png


In [27]:
# --- Cell 18: Export Results ---
import json

def export_analysis_results():
    """Export analysis results to JSON"""
    results = {
        "graph_analyses": {name: analysis for name, analysis in graph_analyses.items()},  # No vars() needed
        "community_analyses": {name: comm for name, comm in community_analyses.items()}   # Already a dict
    }
    
    output_file = "amazon_graph_analysis.json"
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"✅ Exported analysis results to {output_file}")

if graph_analyses and community_analyses:
    export_analysis_results()

✅ Exported analysis results to amazon_graph_analysis.json


In [None]:
# --- Cell 19: Final Gradio Interface with Enhanced Aesthetics and Robust Port Handling ---
import gradio as gr
from gradio.themes import Soft
import matplotlib.pyplot as plt
import socket

# Custom theme with a modern, sleek look
custom_theme = Soft(
    primary_hue="indigo",
    secondary_hue="purple",
    neutral_hue="gray",
    text_size="lg",
    radius_size="md",
    spacing_size="lg",
).set(
    body_background_fill="#f5f7fa",
    body_background_fill_dark="#1a1b26",
    button_primary_background_fill="#4f46e5",
    button_primary_text_color="#ffffff",
    button_secondary_background_fill="#9333ea",
    block_background_fill="#ffffff",
    block_border_color="#e0e7ff",
    block_shadow="0 4px 6px rgba(0, 0, 0, 0.1)",
)

def find_free_port(start_port=7860, max_attempts=10):
    """Find an available port starting from start_port."""
    port = start_port
    for _ in range(max_attempts):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(("0.0.0.0", port))
                return port
            except OSError:
                port += 1
    raise OSError(f"No free ports found in range {start_port}-{start_port + max_attempts - 1}")

def create_final_gradio_interface():
    with gr.Blocks(
        title="Amazon Co-Purchase Network Explorer",
        theme=custom_theme,
        css="""
            .gradio-container { max-width: 1200px; margin: auto; }
            h1 { font-family: 'Arial', sans-serif; font-weight: bold; color: #4f46e5; text-align: center; }
            .section-header { color: #9333ea; font-size: 1.5em; margin-bottom: 10px; }
            .example-box { background-color: #eef2ff; padding: 10px; border-radius: 8px; }
            .output-box { border: 2px solid #e0e7ff; border-radius: 8px; padding: 10px; }
        """
    ) as demo:
        gr.Markdown(
            """
            # Amazon Co-Purchase Network Explorer
            Dive into the world of Amazon product relationships with cutting-edge graph analysis!
            """,
            elem_classes=["header"]
        )

        with gr.Tabs():
            with gr.Tab("Query Explorer", elem_id="query-tab"):
                gr.Markdown("### Unleash Your Questions", elem_classes=["section-header"])
                with gr.Row():
                    with gr.Column(scale=3):
                        query_input = gr.Textbox(
                            label="Ask Away",
                            placeholder="Try: 'What are the neighbors of node 0 in amazon0302?'",
                            lines=2,
                            show_label=True,
                            elem_classes=["input-box"]
                        )
                    with gr.Column(scale=1):
                        query_button = gr.Button("Explore", variant="primary")
                query_output = gr.Textbox(
                    label="Discovery Zone",
                    lines=5,
                    show_copy_button=True,
                    elem_classes=["output-box"]
                )
                gr.Markdown("#### Cool Query Ideas", elem_classes=["section-header"])
                with gr.Group(elem_classes=["example-box"]):
                    gr.Examples(
                        examples=[
                            "What are the neighbors of node 0 in amazon0302?",
                            "Find the shortest path from node 0 to node 5 in amazon0312",
                            "What is the degree distribution in amazon0505?",
                            "How many communities are in amazon0601?",
                            "What products are frequently co-purchased with node 10 in amazon0302?"
                        ],
                        inputs=query_input
                    )

            with gr.Tab("Visual Insights", elem_id="viz-tab"):
                gr.Markdown("### See the Network in Action", elem_classes=["section-header"])
                with gr.Row():
                    graph_dropdown = gr.Dropdown(
                        choices=list(amazon_graphs.keys()),
                        label="Pick Your Graph",
                        value=list(amazon_graphs.keys())[0],
                        elem_classes=["input-box"]
                    )
                    viz_type = gr.Radio(
                        ["Graph", "Degree Distribution", "Community Distribution"],
                        label="Visualization Style",
                        value="Graph",
                        elem_classes=["input-box"]
                    )
                    visualize_button = gr.Button("Reveal", variant="secondary")
                visualization_output = gr.Image(
                    label="Network Vision",
                    type="filepath",
                    elem_classes=["output-box"],
                    height=500
                )
                gr.Markdown(
                    "*Tip: Switch between styles to uncover different perspectives!*",
                    elem_classes=["tip"]
                )

            with gr.Tab("Deep Analysis", elem_id="analysis-tab"):
                gr.Markdown("### Crunch the Numbers", elem_classes=["section-header"])
                with gr.Row():
                    analyze_dropdown = gr.Dropdown(
                        choices=list(amazon_graphs.keys()),
                        label="Choose Graph to Analyze",
                        value=list(amazon_graphs.keys())[0],
                        elem_classes=["input-box"]
                    )
                    analyze_button = gr.Button("Analyze", variant="primary")
                analysis_output = gr.Textbox(
                    label="Insights Unveiled",
                    lines=10,
                    show_copy_button=True,
                    elem_classes=["output-box"]
                )
                gr.Markdown(
                    "*Dive into node counts, degrees, and community structures!*",
                    elem_classes=["tip"]
                )

        # Event Handlers
        def viz_selector(graph_name, viz_type):
            if viz_type == "Graph":
                return create_matplotlib_visualization(graph_name)
            elif viz_type == "Degree Distribution":
                return plot_degree_distribution(graph_name)
            else:
                return plot_community_distribution(graph_name)

        query_button.click(
            enhanced_query,
            inputs=query_input,
            outputs=query_output
        )
        visualize_button.click(
            viz_selector,
            inputs=[graph_dropdown, viz_type],
            outputs=visualization_output
        )
        analyze_button.click(
            display_analysis,
            inputs=analyze_dropdown,
            outputs=analysis_output
        )

    return demo

# Launch the masterpiece with dynamic port selection
if amazon_graphs:
    try:
        demo = create_final_gradio_interface()
        port = find_free_port(start_port=7860)  # Start at 7860, increment if needed
        print(f"Launching on port {port}...")
        demo.launch(share=True, server_name="0.0.0.0", server_port=port)
    except OSError as e:
        print(f"Failed to launch Gradio: {e}")
        print("Please free up ports or try a different range. Run 'lsof -i :7860' to check port usage.")
else:
    print("No graphs loaded. Please check data loading steps.")

Launching on port 7864...
* Running on local URL:  http://0.0.0.0:7864
* Running on public URL: https://448c0e3b00ee03c2e5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




[1m> Entering new AgentExecutor chain...[0m
Graph too large (30232 nodes), sampling 5000 nodes


[1m> Entering new AgentExecutor chain...[0m
