In [18]:
# Install the required packages correctly
import sys
!{sys.executable} -m pip install python-arango networkx pandas gdown
!{sys.executable} -m pip install langchain langchain-openai langchain-community openai



In [19]:
import sys
import subprocess
import importlib

import networkx as nx
from arango import ArangoClient
# Update these langchain imports
from langchain_openai import OpenAI  # Or use this
# Alternatively, you might need to use:
# from langchain_community.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import pandas as pd
import json
import gdown

In [20]:
def install_and_import(package):
    try:
        importlib.import_module(package)
        print(f"✅ {package} is already installed")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"✅ Successfully installed {package}")

# Install required packages
required_packages = ["python-arango", "networkx", "langchain", "gdown", "pandas", "openai"]
for package in required_packages:
    install_and_import(package)

# Note about cugraph
print("Note: cugraph requires CUDA. If you don't have a GPU, we'll use NetworkX instead.")
try:
    install_and_import("cugraph")
    use_cugraph = True
except:
    print("⚠️ Could not install cugraph. Using NetworkX for all graph operations.")
    use_cugraph = False

# Import required libraries after installation
import networkx as nx
from arango import ArangoClient
from langchain.llms import OpenAI
from langchain import PromptTemplate, LLMChain
import pandas as pd
import json
import gdown

print("All dependencies imported successfully!")


Installing python-arango...
✅ Successfully installed python-arango
✅ networkx is already installed
✅ langchain is already installed
✅ gdown is already installed
✅ pandas is already installed
✅ openai is already installed
Note: cugraph requires CUDA. If you don't have a GPU, we'll use NetworkX instead.
Installing cugraph...
⚠️ Could not install cugraph. Using NetworkX for all graph operations.
All dependencies imported successfully!


In [21]:
# Step 1: Setup & Dependencies
import sys
import subprocess
import importlib

# Function to install and import a package
def install_and_import(package):
    try:
        importlib.import_module(package)
        print(f"✅ {package} is already installed")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"✅ Successfully installed {package}")

# Install required packages
required_packages = ["python-arango", "networkx", "pandas", "gdown", "requests", "tqdm"]
# Update langchain packages to use the newer structure
langchain_packages = ["langchain", "langchain-openai", "langchain-community"]

for package in required_packages:
    install_and_import(package)
for package in langchain_packages:
    install_and_import(package)

# Import required libraries after installation
import networkx as nx
from arango import ArangoClient
# Updated imports for langchain
from langchain_openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import pandas as pd
import json
import gdown
import requests
import os
from tqdm import tqdm
import gzip
import time

print("All dependencies imported successfully!")

Installing python-arango...
✅ Successfully installed python-arango
✅ networkx is already installed
✅ pandas is already installed
✅ gdown is already installed
✅ requests is already installed
✅ tqdm is already installed
✅ langchain is already installed
Installing langchain-openai...
✅ Successfully installed langchain-openai
Installing langchain-community...
✅ Successfully installed langchain-community
All dependencies imported successfully!


In [23]:
print("\nSetting up Amazon SNAP dataset downloads...")

def download_file(url, filename):
    """Download a file with progress bar"""
    # This function is only called if the file is missing, so no need to check if it exists here
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx status codes)
        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024  # 1 Kibibyte

        with open(filename, 'wb') as file, tqdm(
                desc=filename,
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
            for data in response.iter_content(block_size):
                size = file.write(data)
                bar.update(size)

        print(f"✅ Downloaded {filename}")
        return filename
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Error downloading file: {e}")
        return None  # Indicate failure
    except Exception as e:
        print(f"⚠️ An unexpected error occurred during download: {e}")
        return None # Indicate failure

def parse_amazon_metadata(gz_file):
    """Parse Amazon metadata from gzipped file"""
    print(f"Parsing metadata from {gz_file}...")
    products = []
    current_product = {}

    try:
        with gzip.open(gz_file, 'rt', encoding='latin1') as f:
            lines = []
            for i, line in enumerate(tqdm(f, desc="Reading lines")):
                line = line.strip()
                if line == '':
                    if current_product:
                        products.append(current_product)
                        current_product = {}
                else:
                    if ':' in line:
                        key, value = line.split(':', 1)
                        current_product[key.strip()] = value.strip()

                # For testing, limit to a sample
                if len(products) >= 10000:  # Adjust this number as needed
                    break

        print(f"✅ Parsed {len(products)} products")

        # Convert to DataFrame
        df = pd.DataFrame(products)

        # Clean up and prepare for graph analysis
        if 'ASIN' in df.columns:
            df['ASIN'] = df['ASIN'].astype(str)

        # Save a CSV version for easier reuse
        csv_file = gz_file.replace('.gz', '.csv')
        df.to_csv(csv_file, index=False)
        print(f"✅ Saved to {csv_file}")

        return df
    except Exception as e:
        print(f"⚠️ Error parsing metadata file: {e}")
        return None

def parse_amazon_copurchase(gz_file):
    """Parse Amazon co-purchasing network from gzipped file"""
    print(f"Parsing co-purchase network from {gz_file}...")
    edges = []

    try:
        with gzip.open(gz_file, 'rt', encoding='latin1') as f:
            for i, line in enumerate(tqdm(f, desc="Reading edges")):
                if not line.startswith('#'):
                    source, target = line.strip().split()
                    edges.append((source, target))

                # For testing, limit to a sample
                if len(edges) >= 100000:  # Adjust this number as needed
                    break

        print(f"✅ Parsed {len(edges)} co-purchase edges")

        # Convert to DataFrame
        df = pd.DataFrame(edges, columns=['source', 'target'])

        # Save a CSV version for easier reuse
        csv_file = gz_file.replace('.gz', '.csv')
        df.to_csv(csv_file, index=False)
        print(f"✅ Saved to {csv_file}")

        return df
    except Exception as e:
        print(f"⚠️ Error parsing co-purchase network: {e}")
        return None

# SNAP Amazon Dataset URLs - choose based on your needs
amazon_datasets = {
    "metadata": "http://snap.stanford.edu/data/amazon/productGraph/metadata.json.gz",
    "copurchase": "http://snap.stanford.edu/data/amazon0601.txt.gz",
    # For a smaller dataset, you can use category-specific ones:
    "books": "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Books.csv",
    "electronics": "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Electronics.csv"
}

# Create data directory if it doesn't exist
data_dir = "amazon_data"
os.makedirs(data_dir, exist_ok=True)

# Download and process datasets
print("\nDownloading and processing Amazon SNAP datasets...")
datasets = {}

# Load metadata.csv directly if it exists, otherwise, load from gz and create a CSV.
metadata_csv_file = os.path.join(data_dir, "metadata.csv")
metadata_gz_file = os.path.join(data_dir, "metadata.json.gz")

if os.path.exists(metadata_csv_file):
    print("✅ Using existing metadata CSV file.")
    try:
        datasets["metadata"] = pd.read_csv(metadata_csv_file)
    except pd.errors.EmptyDataError:
        print("⚠️ The metadata CSV file is empty!")
        datasets["metadata"] = pd.DataFrame()  # Create an empty dataframe
else:
    print("Metadata CSV file not found. Attempting to parse from gz file...")
    if not os.path.exists(metadata_gz_file):
        download_file(amazon_datasets["metadata"], metadata_gz_file)
    metadata_df = parse_amazon_metadata(metadata_gz_file)
    if metadata_df is not None:
        datasets["metadata"] = metadata_df
    else:
        print("⚠️ Could not parse metadata from gz file. Metadata will be unavailable.")
        datasets["metadata"] = pd.DataFrame() # or some other default if needed


# Load copurchase data. Load from CSV if available, otherwise load from gz and create.
copurchase_csv_file = os.path.join(data_dir, "amazon0601.txt.csv")
copurchase_gz_file = os.path.join(data_dir, "amazon0601.txt.gz")

if os.path.exists(copurchase_csv_file):
    print("✅ Using existing copurchase CSV file.")
    try:
        datasets["copurchase"] = pd.read_csv(copurchase_csv_file)
    except pd.errors.EmptyDataError:
        print("⚠️ The copurchase CSV file is empty!")
        datasets["copurchase"] = pd.DataFrame(columns=['source', 'target']) # or some other default if needed
else:
    print("Co-purchase CSV file not found. Attempting to parse from gz file...")
    if not os.path.exists(copurchase_gz_file):
        download_file(amazon_datasets["copurchase"], copurchase_gz_file)
    copurchase_df = parse_amazon_copurchase(copurchase_gz_file)
    if copurchase_df is not None:
        datasets["copurchase"] = copurchase_df
    else:
        print("⚠️ Could not parse copurchase data from gz file. Co-purchase data will be unavailable.")
        datasets["copurchase"] = pd.DataFrame(columns=['source', 'target']) # or some other default if needed


Setting up Amazon SNAP dataset downloads...

Downloading and processing Amazon SNAP datasets...
Metadata CSV file not found. Attempting to parse from gz file...
Parsing metadata from amazon_data/metadata.json.gz...


Reading lines: 9430088it [02:15, 69358.44it/s]

✅ Parsed 0 products
✅ Saved to amazon_data/metadata.json.csv
✅ Using existing copurchase CSV file.





In [24]:
print("\nPreparing Amazon product graph...")

def prepare_amazon_graph(copurchase_df, metadata_df=None):
    """Transform Amazon dataset into a graph structure"""
    G = nx.DiGraph()

    # Add edges from co-purchase data
    for _, row in tqdm(copurchase_df.iterrows(), total=len(copurchase_df), desc="Adding edges"):
        G.add_edge(str(row['source']), str(row['target']))

    # Add node attributes from metadata if available
    if metadata_df is not None and 'ASIN' in metadata_df.columns:
        print("Adding product metadata to nodes...")
        for _, row in tqdm(metadata_df.iterrows(), total=len(metadata_df), desc="Adding metadata"):
            asin = str(row['ASIN'])
            if asin in G:
                # Add attributes from metadata
                for col in metadata_df.columns:
                    if col != 'ASIN' and pd.notna(row[col]):
                        G.nodes[asin][col] = row[col]

    print(f"✅ Created Amazon product graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
    return G

# Create the Amazon product graph
amazon_graph = prepare_amazon_graph(
    datasets["copurchase"],
    datasets.get("metadata") if "metadata" in datasets else None
)



Preparing Amazon product graph...


Adding edges: 100%|██████████| 100000/100000 [00:04<00:00, 22755.20it/s]

✅ Created Amazon product graph with 26520 nodes and 100000 edges





In [25]:
print("\nPerforming basic graph analysis...")

def analyze_graph(G):
    """Perform basic analysis on the graph"""
    analysis = {}

    # Basic statistics
    analysis["num_nodes"] = G.number_of_nodes()
    analysis["num_edges"] = G.number_of_edges()

    # Compute degree statistics (this can be slow for large graphs)
    print("Computing degree statistics...")
    degrees = [d for n, d in G.degree()]
    analysis["avg_degree"] = sum(degrees) / len(degrees)
    analysis["max_degree"] = max(degrees)

    # Identify top nodes by degree (potential influential products)
    print("Finding most connected products...")
    degree_dict = dict(G.degree())
    top_nodes = sorted(degree_dict.items(), key=lambda x: x[1], reverse=True)[:10]
    analysis["top_nodes_by_degree"] = top_nodes

    # Extract largest connected component
    print("Finding largest connected component...")
    largest_cc = max(nx.weakly_connected_components(G), key=len)
    analysis["largest_cc_size"] = len(largest_cc)
    analysis["largest_cc_percentage"] = len(largest_cc) / G.number_of_nodes() * 100

    # Sample a small subgraph for visualization and detailed analysis
    print("Creating sample subgraph for detailed analysis...")
    seed_node = top_nodes[0][0]  # Use the highest degree node as seed
    sample_nodes = set([seed_node])
    frontier = set([seed_node])

    # BFS to get a neighborhood around the seed
    while len(sample_nodes) < 100 and frontier:
        new_frontier = set()
        for node in frontier:
            neighbors = set(G.neighbors(node))
            new_nodes = neighbors - sample_nodes
            sample_nodes.update(list(new_nodes)[:5])  # Limit to 5 new neighbors per node
            new_frontier.update(list(new_nodes)[:5])
            if len(sample_nodes) >= 100:
                break
        frontier = new_frontier

    sample_subgraph = G.subgraph(sample_nodes)
    analysis["sample_subgraph"] = sample_subgraph
    analysis["sample_subgraph_size"] = sample_subgraph.number_of_nodes()

    print(f"✅ Completed basic graph analysis")
    return analysis

# Run the analysis
graph_analysis = analyze_graph(amazon_graph)

# Print some findings
print("\nAmazon Product Network Analysis Results:")
print(f"Total products (nodes): {graph_analysis['num_nodes']:,}")
print(f"Total co-purchase links (edges): {graph_analysis['num_edges']:,}")
print(f"Average connections per product: {graph_analysis['avg_degree']:.2f}")
print(f"Maximum connections for a product: {graph_analysis['max_degree']}")
print(f"Largest connected component contains {graph_analysis['largest_cc_percentage']:.2f}% of products")

print("\nTop 10 most connected products (potential influencers):")
for i, (node, degree) in enumerate(graph_analysis['top_nodes_by_degree'], 1):
    print(f"{i}. Product {node}: {degree} connections")


Performing basic graph analysis...
Computing degree statistics...
Finding most connected products...
Finding largest connected component...
Creating sample subgraph for detailed analysis...
✅ Completed basic graph analysis

Amazon Product Network Analysis Results:
Total products (nodes): 26,520
Total co-purchase links (edges): 100,000
Average connections per product: 7.54
Maximum connections for a product: 139
Largest connected component contains 100.00% of products

Top 10 most connected products (potential influencers):
1. Product 36: 139 connections
2. Product 5: 134 connections
3. Product 89: 121 connections
4. Product 41: 115 connections
5. Product 44: 103 connections
6. Product 50: 100 connections
7. Product 48: 99 connections
8. Product 406: 97 connections
9. Product 1862: 94 connections
10. Product 90: 92 connections


In [26]:
print("\nPerforming community detection...")

def detect_communities(G, max_nodes=5000):
    """Detect communities in the graph"""
    # For large graphs, sample a subgraph
    if G.number_of_nodes() > max_nodes:
        print(f"Graph is large ({G.number_of_nodes()} nodes), sampling {max_nodes} nodes for community detection...")
        # Use the sample subgraph from analysis
        subgraph = graph_analysis["sample_subgraph"]
    else:
        subgraph = G

    # Convert to undirected for community detection algorithms
    undirected_G = subgraph.to_undirected()

    print("Running Louvain community detection...")
    try:
        # Try to import community module (python-louvain)
        import community as community_louvain
        partition = community_louvain.best_partition(undirected_G)

        # Count communities and their sizes
        communities = {}
        for node, community_id in partition.items():
            if community_id not in communities:
                communities[community_id] = []
            communities[community_id].append(node)

        # Sort by community size
        sorted_communities = sorted(communities.items(), key=lambda x: len(x[1]), reverse=True)

        print(f"✅ Detected {len(communities)} communities")
        return {
            "algorithm": "louvain",
            "num_communities": len(communities),
            "community_sizes": [len(comm) for _, comm in sorted_communities[:10]],
            "top_communities": sorted_communities[:5],
            "node_communities": partition
        }
    except ImportError:
        print("Louvain algorithm not available, using connected components instead...")
        # Fallback to connected components
        components = list(nx.connected_components(undirected_G))
        sorted_components = sorted(components, key=len, reverse=True)

        print(f"✅ Detected {len(components)} connected components")
        return {
            "algorithm": "connected_components",
            "num_communities": len(components),
            "community_sizes": [len(comp) for comp in sorted_components[:10]],
            "top_communities": [(i, list(comp)) for i, comp in enumerate(sorted_components[:5])],
            "node_communities": {node: i for i, comp in enumerate(components) for node in comp}
        }

# Try to install the community detection library
try:
    install_and_import("python-louvain")
except:
    print("Could not install python-louvain. Will use connected components instead.")

# Run community detection
community_analysis = detect_communities(amazon_graph)

# Print community findings
print("\nCommunity Detection Results:")
print(f"Algorithm used: {community_analysis['algorithm']}")
print(f"Number of communities/clusters detected: {community_analysis['num_communities']}")
print(f"Top 5 community sizes: {community_analysis['community_sizes'][:5]}")


Performing community detection...
Installing python-louvain...
✅ Successfully installed python-louvain
Graph is large (26520 nodes), sampling 5000 nodes for community detection...
Running Louvain community detection...
✅ Detected 4 communities

Community Detection Results:
Algorithm used: louvain
Number of communities/clusters detected: 4
Top 5 community sizes: [34, 24, 24, 19]


In [27]:
print("\nChecking ArangoDB connection...")

def setup_arangodb():
    """Setup ArangoDB connection safely"""
    try:
        client = ArangoClient(hosts="http://localhost:8529")
        try:
            db = client.db('amazon_db', username='root', password='password')
            print("✅ Connected to ArangoDB with provided credentials")
        except:
            # If default credentials fail, create the database
            sys_db = client.db('_system', username='root', password='password')
            if not sys_db.has_database('amazon_db'):
                sys_db.create_database('amazon_db')
                print("✅ Created amazon_db database")
            db = client.db('amazon_db', username='root', password='password')
        return db
    except Exception as e:
        print(f"⚠️ Error connecting to ArangoDB: {e}")
        print("⚠️ Graph will not be persisted to database")
        return None

db = setup_arangodb()

def persist_amazon_graph(G, db):
    """Save Amazon graph into ArangoDB"""
    if db is None:
        return False

    try:
        # Create collections
        nodes_collection = "amazon_products"
        edges_collection = "amazon_copurchase"

        if not db.has_collection(nodes_collection):
            db.create_collection(nodes_collection)
            print(f"✅ Created collection: {nodes_collection}")

        if not db.has_collection(edges_collection):
            db.create_collection(edges_collection, edge=True)
            print(f"✅ Created edge collection: {edges_collection}")

        # Insert nodes in batches
        products_collection = db.collection(nodes_collection)
        batch_size = 1000
        total_nodes = G.number_of_nodes()

        print(f"Inserting {total_nodes} nodes in batches of {batch_size}...")
        nodes_list = list(G.nodes(data=True))

        for i in tqdm(range(0, total_nodes, batch_size), desc="Inserting node batches"):
            batch = nodes_list[i:i+batch_size]
            nodes_batch = []

            for node, attrs in batch:
                node_doc = {"_key": str(node).replace("/", "_")}
                node_doc.update(attrs)
                nodes_batch.append(node_doc)

            # Import batch
            products_collection.import_bulk(nodes_batch, on_duplicate="update")

        # Insert edges in batches
        copurchase_collection = db.collection(edges_collection)
        total_edges = G.number_of_edges()

        print(f"Inserting {total_edges} edges in batches of {batch_size}...")
        edges_list = list(G.edges())

        for i in tqdm(range(0, total_edges, batch_size), desc="Inserting edge batches"):
            batch = edges_list[i:i+batch_size]
            edges_batch = []

            for source, target in batch:
                source_key = str(source).replace("/", "_")
                target_key = str(target).replace("/", "_")
                edges_batch.append({
                    "_from": f"{nodes_collection}/{source_key}",
                    "_to": f"{nodes_collection}/{target_key}"
                })

            # Import batch
            copurchase_collection.import_bulk(edges_batch, on_duplicate="update")

        print(f"✅ Successfully persisted Amazon graph to ArangoDB")
        return True
    except Exception as e:
        print(f"⚠️ Error persisting graph: {e}")
        return False

# Only persist if database is available and user confirms
if db is not None:
    # Uncomment the line below to persist (can be slow for large graphs)
    # persist_amazon_graph(amazon_graph, db)
    print("ArangoDB persistence ready, but skipped for performance reasons.")
    print("You can uncomment the persistence code to enable it.")


Checking ArangoDB connection...
✅ Connected to ArangoDB with provided credentials
ArangoDB persistence ready, but skipped for performance reasons.
You can uncomment the persistence code to enable it.


In [33]:
print("\nSetting up LangChain for graph insights with Cohere...")
from langchain import PromptTemplate, LLMChain
from langchain.llms import Cohere
import os

def setup_langchain_cohere(graph_analysis, community_analysis):
    """Sets up LangChain with the Cohere API."""
    try:
        if 'COHERE_API_KEY' in os.environ:
            print("✅ Using Cohere")
            query_template = PromptTemplate(
                template="""
            Based on the network analysis:

            Graph has {num_nodes} nodes and {num_edges} edges
            Average degree: {avg_degree:.2f}
            Max degree: {max_degree}
            Communities detected: {num_communities}
            Community Sizes: {community_sizes}
            Top Products by Degree: {top_nodes_by_degree}

            Query: {query}

            Answer:
            """,
                input_variables=["query", "num_nodes", "num_edges", "avg_degree", "max_degree", "num_communities", "community_sizes", "top_nodes_by_degree"]
            )
            cohere_llm = Cohere() # Initialize Cohere LLM
            return LLMChain(llm=cohere_llm, prompt=query_template)

        else:
            raise ValueError("COHERE_API_KEY environment variable not set.") # Raise error if API key is missing


    except Exception as e:
        print(f"⚠️ Error setting up LangChain: {e}")
        return None


def agentic_query(query, llm_chain, graph_analysis, community_analysis):
    """Processes queries about the graph using the LLM chain."""
    if llm_chain is None:
        return "LLM chain not available. Please check setup."
    return llm_chain.run({"query": query, **graph_analysis, **community_analysis})


# Example graph and community analysis data (replace with your actual data)
graph_analysis = {
    "num_nodes": 100,
    "num_edges": 200,
    "avg_degree": 2.5,
    "max_degree": 10,
    "top_nodes_by_degree": [(1, 10), (2, 9)],
}
community_analysis = {
    "num_communities": 5,
    "community_sizes": [20, 15, 12, 34, 19]
}

try:
  #setup langchain with Cohere
  llm_chain = setup_langchain_cohere(graph_analysis, community_analysis)

  # Example queries
  queries = [
      "What is the most influential product?",
      "How many communities are there?",
      "What is the structure of the network?",
      "Give me a product recommendation."
  ]

  for query in queries:
      response = agentic_query(query, llm_chain, graph_analysis, community_analysis)
      print(f"Query: {query}\nAnswer: {response}\n")

except ValueError as e:
    print(f"Error: {e}") #specifically handle missing API key error


Setting up LangChain for graph insights with Cohere...
✅ Using Cohere


  return LLMChain(llm=cohere_llm, prompt=query_template)
  return llm_chain.run({"query": query, **graph_analysis, **community_analysis})


Query: What is the most influential product?
Answer: The product (1,10) is the most influential product in the graph according to the provided network analysis. Product (1,10) has a high degree and is central to the network, meaning it connects many communities and is a top product in terms of connectivity. This could indicate that Product (1,10) is popular, widely used, or referenced, making it influential in driving connections within the network. 

Query: How many communities are there?
Answer: The graph has been detected to have 5 communities. 

Query: What is the structure of the network?
Answer: The network has a clear community structure, a graph theorem which refers to clusters of nodes and the edge paths within them. In this case, it has been detected as having 5 communities, with varying sizes and degrees. 

One node, labeled as 1, is the single hub in its community, while another node, labeled as 2, is the authority in its respective community. Authority nodes tend to connec

In [32]:
import os
os.environ["COHERE_API_KEY"] = "WcXnR3lxNWGwnoJmI2hq8CnCmPfAr8fRbFFacCsT"
!pip install cohere

Collecting cohere
  Downloading cohere-5.13.12-py3-none-any.whl.metadata (3.4 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.0.20241016-py3-none-any.whl.metadata (1.9 kB)
Downloading cohere-5.13.12-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.9/252.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading types_requests-2.32.0.20241016-py3-none-any.whl (15 kB)
Installing collected packages: types-requests, fastavro, cohere
Successfully installed cohere-5.13.12 fastavro-1.10.0 types-requests-2.32.0.20241016


In [35]:
import time
import os
from langchain import PromptTemplate, LLMChain
from langchain.llms import Cohere

def setup_langchain_cohere(graph_analysis, community_analysis):
    """Sets up LangChain with the Cohere API."""
    try:
        if 'COHERE_API_KEY' in os.environ:
            print("✅ Using Cohere")
            query_template = PromptTemplate(
                template="""
            Based on the network analysis:

            Graph has {num_nodes} nodes and {num_edges} edges
            Average degree: {avg_degree:.2f}
            Max degree: {max_degree}
            Communities detected: {num_communities}
            Community Sizes: {community_sizes}
            Top Products by Degree: {top_nodes_by_degree}

            Query: {query}

            Answer:
            """,
                input_variables=["query", "num_nodes", "num_edges", "avg_degree", "max_degree", "num_communities", "community_sizes", "top_nodes_by_degree"]
            )
            cohere_llm = Cohere()
            return LLMChain(llm=cohere_llm, prompt=query_template)

        else:
            raise ValueError("COHERE_API_KEY environment variable not set.")

    except Exception as e:
        print(f"⚠️ Error setting up LangChain: {e}")
        return None

def agentic_query(query, llm_chain, graph_analysis, community_analysis):
    """Processes queries about the graph using the LLM chain."""
    if llm_chain is None:
        return "LLM chain not available. Please check setup."
    return llm_chain.run({"query": query, **graph_analysis, **community_analysis})

# Example graph and community analysis data (replace with your actual data)
graph_analysis = {
    "num_nodes": 100,
    "num_edges": 200,
    "avg_degree": 2.5,
    "max_degree": 10,
    "top_nodes_by_degree": [(1, 10), (2, 9)],
}
community_analysis = {
    "num_communities": 5,
    "community_sizes": [20, 15, 12, 34, 19]
}

try:
  #setup langchain with Cohere
  llm_chain = setup_langchain_cohere(graph_analysis, community_analysis)

  print("\nTesting agentic queries on Amazon graph...")
  examples = [
      "What are the most influential products in the Amazon network?",
      "What insights can we gain from the community structure?",
      "How can this graph be used for product recommendations?",
      "What does the network structure tell us about Amazon's marketplace?"
  ]

  for example in examples:
      print(f"\nQuery: {example}")
      time.sleep(1)
      result = agentic_query(example, llm_chain, graph_analysis, community_analysis) #fixed by providing the other three variables
      print(f"Result: {result}")

  print("\n✅ Amazon SNAP Graph Analysis complete!")

except ValueError as e:
    print(f"Error: {e}")

✅ Using Cohere

Testing agentic queries on Amazon graph...

Query: What are the most influential products in the Amazon network?
Result: Based on the provided network analysis of Amazon's product network, the top two most influential products (by degree) are product IDs 1 and 2, with 9 and 10 reviews respectively. These products have the highest degree centrality among all items in the network, meaning they have been reviewed the most often. This could suggest that these products are among the most popular and well-reviewed items on Amazon, generating a significant amount of discussion and feedback from customers. 

It's important to note that degree centrality is just one aspect of network analysis, and other metrics such as betweenness and eigenvector centrality can also influence the overall influence and importance of a node within the network. However, based solely on the information provided, Products 1 and 2 stand out as the most prominently reviewed items. 

It would be benefic

In [38]:
import networkx as nx
import os
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_cohere import ChatCohere

In [40]:
!pip install langchain_cohere streamlit

Collecting streamlit
  Downloading streamlit-1.42.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.2-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m89.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[

In [41]:
import sys
import subprocess
import importlib
import networkx as nx
from arango import ArangoClient
from langchain_openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import pandas as pd
import json
import gdown
import requests
from tqdm import tqdm
import gzip
import time
import streamlit as st
import matplotlib.pyplot as plt  # Import matplotlib for visualizations

# ----------------------------------------------------------------------------
# 1. Dependency Installation (Using Jupyter-Friendly Method)
# ----------------------------------------------------------------------------

def install_and_import(package):
    installed = True #boolean to store whether the required packages are installed

    try:
        importlib.import_module(package)
        print(f"✅ {package} is already installed")
    except ImportError:
        try:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            print(f"✅ Successfully installed {package}")
            importlib.import_module(package) # Check if it can be imported after installation
        except Exception as e:
            print(f"⚠️ Error installing {package}: {e}")
            print("Skipping this package.")
            installed = False

    return installed

#required packages
required_packages = ["python-arango", "networkx", "pandas", "gdown", "requests", "tqdm", "streamlit", "matplotlib"]
langchain_packages = ["langchain", "langchain-openai", "langchain-community"]
all_packages = required_packages + langchain_packages
installed_all_packages = True
missing_packages = []

for package in all_packages:
    try:
        importlib.import_module(package)
        print(f"✅ {package} is already installed")
    except ImportError:
        missing_packages.append(package)
        installed_all_packages = False # We don't know the results yet.

if not installed_all_packages:
    print("Installing all missing packages in one go...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_packages)
        print("✅ Successfully installed all missing packages")
        for package in missing_packages:
            try:
                 importlib.import_module(package)  # Double check if it can be imported now.
            except:
                 print(f"Failed to import {package}, there might be a dependency error.")
                 installed_all_packages = False

    except Exception as e:
        print(f"⚠️ Error installing packages: {e}")
        print("Skipping graph analysis and web interface setup.")
        installed_all_packages = False # Something failed to install so we return false.


✅ networkx is already installed
✅ pandas is already installed
✅ gdown is already installed
✅ requests is already installed
✅ tqdm is already installed
✅ streamlit is already installed
✅ matplotlib is already installed
✅ langchain is already installed
Installing all missing packages in one go...
✅ Successfully installed all missing packages
Failed to import python-arango, there might be a dependency error.
Failed to import langchain-openai, there might be a dependency error.
Failed to import langchain-community, there might be a dependency error.


In [43]:
def main():
    import streamlit as st
    import networkx as nx
    import matplotlib.pyplot as plt
    import pandas as pd
    import time
    import os
    from langchain import PromptTemplate, LLMChain
    from langchain.llms import Cohere

    try: # To catch missing variables if any setup issues occur
      # Load your datasets and process the graph as previously done
      amazon_graph = nx.DiGraph([(i, (i+1) % 100) for i in range(100)])
    except Exception as e:
      print(f"Error creating test graph: {e}")
      exit() # Exit due to essential setup failure

    from itertools import islice # for graph sampling

    # Functions (Copied from previous responses, please note these are just examples)
    def analyze_graph(G):
        """Analyzes a graph and returns various metrics."""
        analysis = {}
        analysis["num_nodes"] = G.number_of_nodes()
        analysis["num_edges"] = G.number_of_edges()
        degrees = [d for n, d in G.degree()]
        analysis["avg_degree"] = sum(degrees) / len(degrees) if degrees else 0
        analysis["max_degree"] = max(degrees) if degrees else 0
        degree_dict = dict(G.degree())
        top_nodes = sorted(degree_dict.items(), key=lambda x: x[1], reverse=True)[:10]
        analysis["top_nodes_by_degree"] = top_nodes

        # Find largest weakly connected component
        connected_components = list(nx.weakly_connected_components(G))
        if connected_components:
            largest_cc = max(connected_components, key=len)
            analysis["largest_cc_size"] = len(largest_cc)
            analysis["largest_cc_percentage"] = len(largest_cc) / G.number_of_nodes() * 100
        else:
            analysis["largest_cc_size"] = 0
            analysis["largest_cc_percentage"] = 0

        # Sample a small subgraph for visualization and detailed analysis
        if top_nodes:
            seed_node = top_nodes[0][0]
            sample_nodes = set([seed_node])
            frontier = set([seed_node])
            while len(sample_nodes) < 100 and frontier:
                new_frontier = set()
                for node in frontier:
                    neighbors = set(G.neighbors(node))
                    new_nodes = neighbors - sample_nodes
                    sample_nodes.update(list(new_nodes)[:5])
                    new_frontier.update(list(new_nodes)[:5])
                    if len(sample_nodes) >= 100:
                        break
                frontier = new_frontier
            sample_subgraph = G.subgraph(sample_nodes)
            analysis["sample_subgraph_nodes"] = list(sample_nodes)  # Store nodes instead of subgraph
            analysis["sample_subgraph_size"] = sample_subgraph.number_of_nodes()
        else:
            analysis["sample_subgraph_nodes"] = []
            analysis["sample_subgraph_size"] = 0

        return analysis

    def detect_communities(G, graph_analysis=None, max_nodes=5000):
        """Detects communities within the graph using Louvain or connected components."""
        # Handle large graphs by sampling
        try: #this could give an error, if no data is there for example
            if G.number_of_nodes() > max_nodes:
                print(f"Graph is large ({G.number_of_nodes()} nodes), sampling {max_nodes} nodes for community detection...")
                if graph_analysis and "sample_subgraph_nodes" in graph_analysis:
                    subgraph = G.subgraph(graph_analysis["sample_subgraph_nodes"])
                else:
                    # Sample nodes if no sample subgraph is available
                    subgraph = G.subgraph(list(G.nodes())[:max_nodes])
            else:
                subgraph = G

            # Convert to undirected for community detection
            undirected_G = subgraph.to_undirected()

            try:
                # Try using Louvain algorithm
                import community as community_louvain
                partition = community_louvain.best_partition(undirected_G)
                communities = {}
                for node, community_id in partition.items():
                    if community_id not in communities:
                        communities[community_id] = []
                    communities[community_id].append(node)
                sorted_communities = sorted(communities.items(), key=lambda x: len(x[1]), reverse=True)
                return {
                    "algorithm": "louvain",
                    "num_communities": len(communities),
                    "community_sizes": [len(comm) for _, comm in sorted_communities[:10]],
                    "top_communities": sorted_communities[:5],
                    "node_communities": partition,
                }
            except ImportError:
                print("Louvain algorithm not available, using connected components instead...")
                # Fallback to connected components
                components = list(nx.connected_components(undirected_G))
                sorted_components = sorted(components, key=len, reverse=True)
                return {
                    "algorithm": "connected_components",
                    "num_communities": len(components),
                    "community_sizes": [len(comp) for comp in sorted_components[:10]],
                    "top_communities": [(i, list(comp)) for i, comp in enumerate(sorted_components[:5])],
                    "node_communities": {node: i for i, comp in enumerate(components) for node in comp}
                }
        except:
            print("Can't do community setup, maybe there is no information in this set?")
            return{}

    def setup_langchain_cohere(graph_analysis, community_analysis):
        """Sets up LangChain with the Cohere API."""
        try:
            # Check if Cohere API key is set
            cohere_api_key = os.environ.get('COHERE_API_KEY')
            if cohere_api_key:
                print("✅ Using Cohere")
                query_template = PromptTemplate(
                    template="""
                Based on the network analysis:
                Graph has {num_nodes} nodes and {num_edges} edges
                Average degree: {avg_degree:.2f}
                Max degree: {max_degree}
                Communities detected: {num_communities}
                Community Sizes: {community_sizes}
                Top Products by Degree: {top_nodes_by_degree}
                Query: {query}
                Answer:
                """,
                    input_variables=["query", "num_nodes", "num_edges", "avg_degree", "max_degree",
                                    "num_communities", "community_sizes", "top_nodes_by_degree"]
                )
                cohere_llm = Cohere(cohere_api_key=cohere_api_key)
                return LLMChain(llm=cohere_llm, prompt=query_template)
            else:
                # Raise error if API key is not set
                raise ValueError("COHERE_API_KEY environment variable not set.")
        except Exception as e:
            print(f"⚠️ Error setting up LangChain: {e}")
            return None

    def agentic_query(query, llm_chain, graph_analysis, community_analysis):
        """Processes queries about the graph using the LLM chain."""
        if llm_chain is None:
            return "LLM chain not available. Please check setup."

        # Create a new dictionary with all parameters
        params = {
            "query": query
        }
        # Add graph analysis parameters
        for key, value in graph_analysis.items():
            if isinstance(value, (str, int, float, list, tuple, dict)) and key != "sample_subgraph_nodes":
                params[key] = value

        # Add community analysis parameters
        for key, value in community_analysis.items():
            if isinstance(value, (str, int, float, list, tuple, dict)) and key != "node_communities":
                params[key] = value

        # Run the chain with the prepared parameters
        return llm_chain.run(**params)

    graph_analysis = analyze_graph(amazon_graph)
    community_analysis = detect_communities(amazon_graph, graph_analysis)
    print(graph_analysis)
    print(community_analysis)
    # ----------------------------------------------------------------------------
    # 4. Streamlit Application
    # ----------------------------------------------------------------------------

    st.title("Amazon Product Network Analysis")

    st.sidebar.header("Graph Statistics")
    st.sidebar.write(f"Total Products (Nodes): {graph_analysis.get('num_nodes'):,}")
    st.sidebar.write(f"Total Co-Purchase Links (Edges): {graph_analysis.get('num_edges'):,}")
    st.sidebar.write(f"Average Connections per Product: {graph_analysis.get('avg_degree'):.2f}")
    st.sidebar.write(f"Maximum Connections for a Product: {graph_analysis.get('max_degree')}")
    st.sidebar.write(f"Largest Connected Component: {graph_analysis.get('largest_cc_percentage'):.2f}%")

    st.sidebar.header("Community Statistics")
    st.sidebar.write(f"Number of Communities: {community_analysis.get('num_communities', 'N/A')}")
    st.sidebar.write("Top 5 Community Sizes:")
    if "community_sizes" in community_analysis:
        for i, size in enumerate(community_analysis['community_sizes'][:5]):
            st.sidebar.write(f"{i+1}: " + str(size))

    # Visualization - using matplotlib for simplicity
    st.header("Graph Visualization")
    st.write("Displaying a sample subgraph for visualization")
    if 'sample_subgraph' in graph_analysis and graph_analysis['sample_subgraph']:
        fig, ax = plt.subplots()
        nx.draw(graph_analysis["sample_subgraph"], with_labels=True, ax=ax)
        st.pyplot(fig)  # st.pyplot for matplotlib plots. If using plotly or other library you will use different command.
    else:
        st.write("No sample subgraph available.")

            # LLM-powered Insights Section
    st.header("LLM-Powered Insights")

    llm_chain = setup_langchain_cohere(graph_analysis, community_analysis)

    if llm_chain:
        query = st.text_input("Enter your query about the Amazon network:")
        if query:
            result = agentic_query(query, llm_chain, graph_analysis, community_analysis)
            st.write("LLM Answer:", result)
    else:
        st.error("Failed to set up the LLM Chain. Check your API key and settings.")
if __name__ == "__main__":
    main()



{'num_nodes': 100, 'num_edges': 100, 'avg_degree': 2.0, 'max_degree': 2, 'top_nodes_by_degree': [(0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (5, 2), (6, 2), (7, 2), (8, 2), (9, 2)], 'largest_cc_size': 100, 'largest_cc_percentage': 100.0, 'sample_subgraph_nodes': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'sample_subgraph_size': 100}
{'algorithm': 'louvain', 'num_communities': 11, 'community_sizes': [12, 11, 10, 10, 9, 9, 9, 9, 8, 7], 'top_communities': [(8, [33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]), (9, [87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97]), (5, [23, 24, 25, 26, 27, 28, 29, 30, 31, 32]), (10, [45, 46, 47, 48, 49, 50, 51, 52, 53, 54

2025-02-25 09:32:45.033 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


✅ Using Cohere


2025-02-25 09:32:45.861 Session state does not function when running a script without `streamlit run`


In [61]:
import gradio as gr
import networkx as nx
import matplotlib.pyplot as plt
import io
import base64
import os
import requests
import gzip
from tqdm import tqdm
from dataclasses import dataclass
from typing import List, Tuple, Optional, Dict

@dataclass
class GraphAnalysis:
    num_nodes: int
    num_edges: int
    avg_degree: float
    max_degree: int
    top_nodes_by_degree: List[Tuple[str, int]]
    largest_cc_size: int
    largest_cc_percentage: float
    sample_subgraph: Optional[nx.DiGraph]
    sample_subgraph_size: int
    image: str

def visualize_graph(graph: Optional[nx.DiGraph]) -> str:
    """Visualizes the graph (or a sample) and returns a base64 encoded image."""
    if graph is None:
        return ""

    # Always sample a subgraph for visualization to keep it simple and fast
    if graph.number_of_nodes() > 100:
        top_nodes = sorted(graph.degree(), key=lambda x: x[1], reverse=True)[:10]  #Top 10
        seed_node = top_nodes[0][0]
        sample_nodes = {seed_node}
        frontier = {seed_node}
        while len(sample_nodes) < 100 and frontier:
            new_frontier = set()
            for node in frontier:
                neighbors = set(graph.neighbors(node))
                new_nodes = (neighbors - sample_nodes)
                selected_nodes = list(new_nodes)[:5]  # Limit to 5 neighbors
                sample_nodes.update(selected_nodes)
                new_frontier.update(selected_nodes)
                if len(sample_nodes) >= 100:
                    break
            frontier = new_frontier
        graph = graph.subgraph(list(sample_nodes))


    plt.figure(figsize=(12, 6))  # Adjust figure size as needed
    nx.draw(graph, with_labels=True, font_weight='bold', node_size=400, font_size=9, alpha=0.7) #Keep small
    plt.title("Generated Graph (Sample)")  # Clarify it's a sample
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close()
    return base64.b64encode(buf.getvalue()).decode('utf-8')

def analyze_graph(graph: Optional[nx.DiGraph]) -> Optional[GraphAnalysis]:
    """Analyzes the graph and returns metrics, including a sampled image."""
    if graph is None:
        return None

    analysis = {
        "num_nodes": graph.number_of_nodes(),
        "num_edges": graph.number_of_edges(),
    }
    degrees = [d for _, d in graph.degree()]
    analysis["avg_degree"] = sum(degrees) / len(degrees) if degrees else 0.0
    analysis["max_degree"] = max(degrees) if degrees else 0
    top_nodes = sorted(graph.degree(), key=lambda x: x[1], reverse=True)[:10] #Top 10
    analysis["top_nodes_by_degree"] = top_nodes

    connected_components = list(nx.weakly_connected_components(graph))
    if connected_components:
        largest_cc = max(connected_components, key=len)
        analysis["largest_cc_size"] = len(largest_cc)
        analysis["largest_cc_percentage"] = (len(largest_cc) / graph.number_of_nodes()) * 100
    else:
        analysis["largest_cc_size"] = 0
        analysis["largest_cc_percentage"] = 0.0

    analysis["sample_subgraph"] = None  # We'll *always* create the sample now
    analysis["sample_subgraph_size"] = 0
    analysis["image"] = visualize_graph(graph)  # Get sampled image

    return GraphAnalysis(**analysis)


def download_file(url: str, filename: str) -> Optional[str]:
    """Downloads a file with a progress bar."""
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        with open(filename, 'wb') as file, tqdm(
            desc=filename, total=total_size, unit='iB', unit_scale=True, unit_divisor=1024
        ) as bar:
            for data in response.iter_content(1024):
                file.write(data)
                bar.update(len(data))
        print(f"✅ Downloaded {filename}")
        return filename
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Error downloading {filename}: {e}")
        return None



def parse_amazon_copurchase(gz_file: str) -> Optional[pd.DataFrame]:
    """Parse Amazon co-purchasing network data."""
    print(f"Parsing co-purchase network from {gz_file}...")
    edges = []
    try:
        with gzip.open(gz_file, 'rt', encoding='latin1') as f:
            for line in tqdm(f, desc="Reading edges"):
                if not line.startswith('#'):
                    source, target = line.strip().split()
                    edges.append((source, target))
        print(f"✅ Parsed {len(edges)} co-purchase edges")
        df = pd.DataFrame(edges, columns=['source', 'target'])
        csv_file = gz_file.replace('.gz', '.csv')
        df.to_csv(csv_file, index=False)
        print(f"✅ Saved to {csv_file}")
        return df
    except Exception as e:
        print(f"⚠️ Error parsing co-purchase data: {e}")
        return None

def load_graph(copurchase_df: Optional[pd.DataFrame]) -> Optional[nx.DiGraph]:
    """Loads the graph from DataFrames."""
    if copurchase_df is None:
        print("⚠️ Copurchase DataFrame is None.")
        return None
    try:
        graph = nx.DiGraph()
        with tqdm(total=len(copurchase_df), desc="Adding edges") as pbar:
            for _, row in copurchase_df.iterrows():
                graph.add_edge(str(row['source']), str(row['target']))
                pbar.update(1)
        print(f"✅ Created graph: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")
        return graph
    except Exception as e:
        print(f"⚠️ Error loading graph: {e}")
        return None


def process_data(max_nodes_to_display: int = 1000) -> Dict[str, str]:
    """Downloads, parses, and analyzes the graph, returning results for Gradio."""

    amazon_datasets = {
        "copurchase": "http://snap.stanford.edu/data/amazon0601.txt.gz",
    }
    data_dir = "amazon_data"
    os.makedirs(data_dir, exist_ok=True)


    copurchase_file = os.path.join(data_dir, "amazon0601.txt.gz")
    copurchase_csv_file = copurchase_file.replace('.gz', '.csv')
    if os.path.exists(copurchase_csv_file):
        print("Using existing copurchase CSV.")
        copurchase_df = pd.read_csv(copurchase_csv_file)
    else:
        if not os.path.exists(copurchase_file):
            download_file(amazon_datasets["copurchase"], copurchase_file)
        copurchase_df = parse_amazon_copurchase(copurchase_file)
        if copurchase_df is None:
            return {
                "graph_summary": "Error: Could not load co-purchase data.",
                "graph_visualization": "",
                "status": "Data loading error."
            }

    amazon_graph = load_graph(copurchase_df)
    if amazon_graph is None:
        return {
            "graph_summary": "Error: Could not create graph.",
            "graph_visualization": "",
            "status": "Graph creation error."
        }

    graph_analysis = analyze_graph(amazon_graph)
    if graph_analysis is None:
        return {
            "graph_summary": "Error: Graph analysis failed.",
            "graph_visualization": "",
            "status": "Graph analysis error."
        }

    # Create a concise summary for the text output
    summary = (
        f"The graph has {graph_analysis.num_nodes} nodes and {graph_analysis.num_edges} edges.\n"
        f"Average degree: {graph_analysis.avg_degree:.2f}, Max degree: {graph_analysis.max_degree}.\n"
        f"Largest connected component size: {graph_analysis.largest_cc_size} "
        f"({graph_analysis.largest_cc_percentage:.2f}% of nodes).\n"
        f"Top nodes by degree: {graph_analysis.top_nodes_by_degree[:5]}" # Top 5

    )

    # Limit the graph displayed
    if graph_analysis.num_nodes > max_nodes_to_display:
      summary += f"\n\nDisplaying a sample of up to {max_nodes_to_display} nodes."

    return {
        "graph_summary": summary,
        "graph_visualization": graph_analysis.image,  # Always a sampled/limited image
        "status": "Graph analysis complete!",
    }

# --- Gradio Interface Setup ---
inputs = [
    gr.Slider(minimum=100, maximum=10000, value=1000, step=100, label="Max Nodes to Display", key="max_nodes_to_display")
]
outputs = [
    gr.Textbox(label="Graph Summary", key="graph_summary"),
    gr.HTML(label="Graph Visualization", key="graph_visualization"),
    gr.Textbox(label="Status", key="status"),
]

iface = gr.Interface(
    fn=process_data,
    inputs=inputs,
    outputs=outputs,
    title="Amazon Graph Analysis (Simplified)",
    description="Analyzes the Amazon product co-purchasing network and displays a simplified graph.",
    allow_flagging="never",  # Prevent flagging, since we don't have user input
)

if __name__ == "__main__":
    iface.launch(debug=False)



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8bd2aa29efd996b207.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [45]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.18.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.9.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta