In [None]:
import os

import openai

# Azure OpenAI settings.
# The endpoint and key are read from the environment so that real credentials
# never have to be typed into (and saved with) the notebook. The placeholder
# literals are only used when the corresponding variable is not set.
azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "your az endpoint")
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "key")
api_version = "2024-10-21"
deployment_name = "gpt-4o"  # Your deployment name

# Create the OpenAI client (shared by the analysis cells further down).
client = openai.AzureOpenAI(
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
)

# Smoke test: send one short prompt to confirm the client configuration works.
response = client.chat.completions.create(
    model=deployment_name,  # the deployment name is passed as 'model' (not 'engine')
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    temperature=0.7,
    max_tokens=100,
)

# Print response
print(response.choices[0].message.content)


Hello! I'm just a computer program, so I don't have feelings, but I'm here and ready to help you with whatever you need. How about you—how are *you* doing today? 😊


In [46]:
import os
import ast
import networkx as nx
import openai
import pickle
import time
import json
from concurrent.futures import ThreadPoolExecutor

def extract_code_structure(file_path, G, current, total, method_count):
    """Parse one Python file and record its classes, methods and calls in ``G``.

    The syntax tree is traversed depth-first while tracking the innermost
    enclosing class, so every method and call is attributed to the class that
    actually contains it. (A flat ``ast.walk`` is breadth-first and carries no
    scope information, which attributes everything to the last class seen.)

    Args:
        file_path: Path of the ``.py`` file to parse (read as UTF-8).
        G: Graph object exposing ``add_node(name, **attrs)`` and
            ``add_edge(src, dst, **attrs)``; it is mutated in place.
        current: 1-based position of this file, used only for progress output.
        total: Total number of files, used only for progress output.
        method_count: List that receives the name of every method found.

    Returns:
        The same graph object ``G``.

    Graph content produced:
        * class node  - ``type="class"``, ``metadata`` = unparsed source.
        * method node - ``type="method"``, ``metadata`` = unparsed source.
        * ``inherits`` edge - base class -> subclass (plain-name bases only).
        * ``contains`` edge - class -> method.
        * ``calls`` edge - enclosing class -> callee (``name`` or ``obj.attr``).

    Functions and calls that are not inside any class are ignored. Files that
    cannot be decoded as UTF-8 or parsed are skipped with a warning instead of
    aborting the whole scan.

    NOTE: nodes are keyed by bare name, so same-named classes or methods
    (e.g. ``__init__``) coming from different classes or files end up on one
    shared node.
    """
    start_time = time.time()
    print(f"[INFO] Processing file {current}/{total}: {file_path}")

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            tree = ast.parse(f.read(), filename=file_path)
    except (SyntaxError, UnicodeDecodeError, ValueError) as exc:
        # ValueError covers sources containing NUL bytes on older interpreters.
        print(f"[WARN] Skipping {file_path}: {exc}")
        return G

    # Explicit stack instead of recursion so deeply nested expressions cannot
    # hit the interpreter's recursion limit. Each entry pairs a node with the
    # name of the innermost class enclosing it (None at module level).
    pending = [(tree, None)]
    while pending:
        node, owner = pending.pop()

        if isinstance(node, ast.ClassDef):
            G.add_node(node.name, type="class", metadata=ast.unparse(node), file=file_path)
            for base in node.bases:
                if isinstance(base, ast.Name):
                    G.add_edge(base.id, node.name, relation="inherits")
            # Everything below this node belongs to this class.
            owner = node.name

        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            if owner is not None:
                G.add_node(node.name, type="method", metadata=ast.unparse(node), file=file_path)
                G.add_edge(owner, node.name, relation="contains")
                method_count.append(node.name)

        elif isinstance(node, ast.Call):
            func = node.func
            if isinstance(func, ast.Name):
                callee = func.id
            elif isinstance(func, ast.Attribute) and isinstance(func.value, ast.Name):
                callee = f"{func.value.id}.{func.attr}"
            else:
                callee = None

            if owner and callee:
                G.add_edge(owner, callee, relation="calls")

        # Push children in reverse so they are popped in source order.
        children = list(ast.iter_child_nodes(node))
        pending.extend((child, owner) for child in reversed(children))

    elapsed_time = time.time() - start_time
    print(f"[INFO] Completed {file_path} in {elapsed_time:.2f} seconds.")
    return G

In [47]:
def _parse_batch_summaries(raw_text, method_names):
    """Map the model's reply onto ``method_names``.

    Prefers a JSON object keyed by method name (surrounding text such as code
    fences is tolerated). If no usable JSON object is found, falls back to
    assigning blank-line-separated paragraphs to the names by position.
    """
    start, end = raw_text.find("{"), raw_text.rfind("}")
    if start != -1 and end > start:
        try:
            parsed = json.loads(raw_text[start:end + 1])
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and any(name in parsed for name in method_names):
            return {name: str(parsed.get(name, "No response")) for name in method_names}

    paragraphs = raw_text.split("\n\n")
    return {
        name: paragraphs[i] if i < len(paragraphs) else "No response"
        for i, name in enumerate(method_names)
    }


def analyze_with_gpt4o(batch_methods, model="gpt-4o"):
    """Ask the chat model to summarise a batch of code snippets.

    Args:
        batch_methods: Mapping of method name -> source code.
        model: Model / deployment name passed to the chat completions call.

    Returns:
        Dict mapping every name in ``batch_methods`` to a summary string, or
        ``"No response"`` when the reply contains nothing for that name.
        An empty input returns ``{}`` without calling the API.

    Uses the module-level ``client`` for the request.
    """
    if not batch_methods:
        return {}

    print(f"[INFO] Sending batch of {len(batch_methods)} methods to GPT-4o for analysis...")
    start_time = time.time()

    batch_code_snippets = "\n\n".join([
        f"Method: {name}\nCode:\n{code}" for name, code in batch_methods.items()
    ])
    # Request a JSON object keyed by method name so that each summary can be
    # matched to its method reliably rather than by paragraph position.
    prompt = f"""
    Analyze the following batch of functions and summarize their purposes.
    Respond with a single JSON object whose keys are the exact method names
    given below and whose values are the summaries. Do not add any other text.

    {batch_code_snippets}
    """

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5
    )

    elapsed_time = time.time() - start_time
    print(f"[INFO] GPT-4o analysis for batch completed in {elapsed_time:.2f} seconds.")

    # The content field may be None; treat that as an empty reply.
    response_text = response.choices[0].message.content or ""
    return _parse_batch_summaries(response_text, list(batch_methods.keys()))

In [48]:
def process_graph_with_llm(G, batch_size=15, max_workers=8):
    """Summarise every method node of ``G`` with the LLM, in parallel batches.

    Method nodes (``type == "method"``) are grouped into batches of
    ``batch_size`` and each batch is sent to ``analyze_with_gpt4o`` on a
    thread pool. The returned summary replaces the node's ``metadata``
    attribute; a name missing from a batch result gets
    ``"Analysis unavailable."``.

    A batch whose request raises is reported and skipped: its nodes keep
    their existing metadata and the remaining batches are still applied, so
    a single failed call does not abort the whole run.

    Args:
        G: Graph whose ``nodes`` mapping yields per-node attribute dicts.
        batch_size: Number of methods sent per request.
        max_workers: Size of the thread pool issuing the requests.

    Returns:
        None. ``G`` is modified in place.
    """
    print("[INFO] Starting GPT-4o analysis for extracted code components...")
    start_time = time.time()

    methods = {
        node: G.nodes[node].get("metadata", "")
        for node in G.nodes
        if G.nodes[node].get("type") == "method"
    }
    method_names = list(methods.keys())
    print(f"[INFO] Total methods to analyze: {len(method_names)}")

    # Create batches of methods
    batches = [method_names[i:i + batch_size] for i in range(0, len(method_names), batch_size)]

    failed_batches = 0
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_batch = {
            executor.submit(analyze_with_gpt4o, {name: methods[name] for name in batch}): batch
            for batch in batches
        }

        for future, batch in future_to_batch.items():
            try:
                results = future.result()
            except Exception as exc:
                # Isolate the failure to this batch; leave its nodes untouched.
                failed_batches += 1
                print(f"[WARN] GPT-4o analysis failed for batch starting at '{batch[0]}': {exc}")
                continue
            for name in batch:
                G.nodes[name]["metadata"] = results.get(name, "Analysis unavailable.")

    if failed_batches:
        print(f"[WARN] {failed_batches} of {len(batches)} batches failed and were left unanalysed.")

    elapsed_time = time.time() - start_time
    print(f"[INFO] GPT-4o analysis phase completed in {elapsed_time:.2f} seconds.")

In [49]:
def generate_documentation(graph, output_path="knowledge_graph_documentation.json"):
    """Write a JSON description of the knowledge graph to ``output_path``.

    The file holds three top-level keys: ``metadata`` (a fixed description
    plus node and edge counts), ``nodes`` (name, type, metadata and file of
    each node) and ``edges`` (source, target and relation of each edge).
    Attributes missing on a node or edge are replaced by placeholder strings.

    Args:
        graph: Graph exposing ``nodes`` / ``edges`` views that support
            ``len()`` and being called with ``data=True``.
        output_path: Destination file, written as UTF-8 with 4-space indent.

    Returns:
        None.
    """
    summary = {
        "description": "Knowledge graph representation of a Python codebase.",
        "total_nodes": len(graph.nodes),
        "total_edges": len(graph.edges),
    }

    # One record per node, with defaults for absent attributes.
    node_records = [
        {
            "name": name,
            "type": attrs.get("type", "unknown"),
            "metadata": attrs.get("metadata", "No metadata available"),
            "file": attrs.get("file", "Unknown file"),
        }
        for name, attrs in graph.nodes(data=True)
    ]

    # One record per edge.
    edge_records = [
        {
            "source": src,
            "target": dst,
            "relation": attrs.get("relation", "unknown relation"),
        }
        for src, dst, attrs in graph.edges(data=True)
    ]

    payload = {"metadata": summary, "nodes": node_records, "edges": edge_records}

    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, indent=4)

    print(f"✅ Documentation saved as '{output_path}'")

In [50]:
def build_knowledge_graph(directory):
    """Build the knowledge graph for every ``.py`` file found under ``directory``.

    The directory tree is walked recursively, each Python file is passed to
    ``extract_code_structure`` together with its 1-based position, and the
    resulting graph is then handed to ``process_graph_with_llm``. Progress and
    timing are reported on stdout.

    Args:
        directory: Root folder to scan.

    Returns:
        The populated ``nx.DiGraph``.
    """
    started_at = time.time()
    print("[INFO] Scanning directory for Python files...")

    graph = nx.DiGraph()
    found_methods = []

    # Recursively collect every file whose name ends in ".py".
    python_files = []
    for folder, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith(".py"):
                python_files.append(os.path.join(folder, filename))

    file_total = len(python_files)
    print(f"[INFO] Found {file_total} Python files.")

    position = 0
    for source_file in python_files:
        position += 1
        graph = extract_code_structure(source_file, graph, position, file_total, found_methods)

    print(f"[INFO] Total methods found: {len(found_methods)}")
    print("[INFO] Starting GPT-4o-based processing of extracted code...")

    process_graph_with_llm(graph)

    duration = time.time() - started_at
    print(f"[INFO] Knowledge graph construction completed in {duration:.2f} seconds.")

    return graph

In [51]:
# **Codebase to analyse**
# Set the CODEBASE_PATH environment variable to point at another codebase
# without editing the notebook; the literal below is only the default.
codebase_path = os.environ.get(
    "CODEBASE_PATH",
    r"C:\Users\rajrishi\OneDrive - Microsoft\Desktop\SmoothStream-master",
)
# Single definition of the pickle file name (used for saving and in the message).
graph_pickle_path = "knowledge_graph_large.gpickle"

# **Generate Knowledge Graph**
print("[INFO] Starting knowledge graph generation...")
global_start_time = time.time()
graph = build_knowledge_graph(codebase_path)

# **Save Graph Using Pickle**
print("[INFO] Saving knowledge graph to file...")
with open(graph_pickle_path, "wb") as f:
    pickle.dump(graph, f)

# **Generate Documentation**
generate_documentation(graph)

total_execution_time = time.time() - global_start_time
print(f"✅ Knowledge graph saved successfully as '{graph_pickle_path}'.")
print(f"⏱️ Total execution time: {total_execution_time:.2f} seconds.")

[INFO] Starting knowledge graph generation...
[INFO] Scanning directory for Python files...
[INFO] Found 8 Python files.
[INFO] Processing file 1/8: C:\Users\rajrishi\OneDrive - Microsoft\Desktop\SmoothStream-master\constants.py
[INFO] Completed C:\Users\rajrishi\OneDrive - Microsoft\Desktop\SmoothStream-master\constants.py in 0.02 seconds.
[INFO] Processing file 2/8: C:\Users\rajrishi\OneDrive - Microsoft\Desktop\SmoothStream-master\Streamer.py
[INFO] Completed C:\Users\rajrishi\OneDrive - Microsoft\Desktop\SmoothStream-master\Streamer.py in 0.02 seconds.
[INFO] Processing file 3/8: C:\Users\rajrishi\OneDrive - Microsoft\Desktop\SmoothStream-master\StreamViewer.py
[INFO] Completed C:\Users\rajrishi\OneDrive - Microsoft\Desktop\SmoothStream-master\StreamViewer.py in 0.02 seconds.
[INFO] Processing file 4/8: C:\Users\rajrishi\OneDrive - Microsoft\Desktop\SmoothStream-master\test_local_streaming.py
[INFO] Completed C:\Users\rajrishi\OneDrive - Microsoft\Desktop\SmoothStream-master\test_l

In [42]:
def load_graph(file_path):
    """Read a pickled knowledge graph from ``file_path`` and return it.

    WARNING: unpickling can execute arbitrary code, so only load files that
    were produced by a trusted run of this notebook.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
    """
    with open(file_path, "rb") as handle:
        payload = handle.read()
    return pickle.loads(payload)

def ask_llm_about_graph(graph, query, max_items=50, model="gpt-4o"):
    """Answer ``query`` about the codebase using the knowledge graph as context.

    The first ``max_items`` nodes and the first ``max_items`` edges are
    rendered as text lines and embedded in a single user prompt, which is
    sent through the module-level ``client``.

    Args:
        graph: Graph whose ``nodes(data=True)`` / ``edges(data=True)`` yield
            attribute dicts (``metadata``, ``file`` and ``relation`` are read).
        query: The user's question.
        max_items: Cap on the number of node lines and of edge lines included,
            to keep the prompt within the model's token limit.
        model: Model / deployment name passed to the chat completions call.

    Returns:
        The text content of the model's first choice.
    """
    nodes_info = []
    edges_info = []

    for node, attributes in graph.nodes(data=True):
        metadata = attributes.get("metadata", "No metadata available")
        file_path = attributes.get("file", "Unknown file")
        nodes_info.append(f"{node} (File: {file_path}) - {metadata}")

    for source, target, attributes in graph.edges(data=True):
        relation = attributes.get("relation", "unknown relation")
        edges_info.append(f"{source} → {target} ({relation})")

    # Join outside the f-string: a backslash inside a replacement field is a
    # syntax error before Python 3.12. Building the prompt from parts also
    # keeps explanatory notes in code comments instead of in the prompt text.
    nodes_block = "\n".join(nodes_info[:max_items])
    edges_block = "\n".join(edges_info[:max_items])

    prompt = (
        "You are given a knowledge graph representing a Python codebase.\n\n"
        f"**Nodes in the Graph:**\n{nodes_block}\n\n"
        f"**Edges in the Graph:**\n{edges_block}\n\n"
        f'**User Query:** "{query}"\n\n'
        "Based on the knowledge graph, provide an answer."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5
    )

    return response.choices[0].message.content

if __name__ == "__main__":
    graph_file = "knowledge_graph_large.gpickle"
    # Number of nodes/edges previewed. One value drives both the headings and
    # the slices so the printed label always matches what is shown. Kept small
    # because class nodes carry whole source snippets as metadata.
    preview_count = 10

    try:
        graph = load_graph(graph_file)
        print("✅ Knowledge graph loaded successfully!")

        print(f"\n📌 **Nodes in Graph (First {preview_count}):**")
        for node, data in list(graph.nodes(data=True))[:preview_count]:
            print(f"🔹 {node} - {data}")

        print(f"\n📌 **Edges in Graph (First {preview_count}):**")
        for source, target, data in list(graph.edges(data=True))[:preview_count]:
            print(f"🔸 {source} → {target} ({data})")

        query = input("\n🔍 Enter your query about the code: ")
        response = ask_llm_about_graph(graph, query)

        print("\n💡 **LLM Response:**")
        print(response)

    except FileNotFoundError:
        print(f"❌ Error: Graph file '{graph_file}' not found. Run the extraction script first.")


✅ Knowledge graph loaded successfully!

📌 **Nodes in Graph (First 10):**
🔹 Streamer - {'type': 'class', 'metadata': 'class Streamer:\n\n    def __init__(self, server_address=SERVER_ADDRESS, port=PORT):\n        """\n        Tries to connect to the StreamViewer with supplied server_address and creates a socket for future use.\n\n        :param server_address: Address of the computer on which the StreamViewer is running, default is `localhost`\n        :param port: Port which will be used for sending the stream\n        """\n        print(\'Connecting to \', server_address, \'at\', port)\n        context = zmq.Context()\n        self.footage_socket = context.socket(zmq.PUB)\n        self.footage_socket.connect(\'tcp://\' + server_address + \':\' + port)\n        self.keep_running = True\n\n    def start(self):\n        """\n        Starts sending the stream to the Viewer.\n        Creates a camera, takes a image frame converts the frame to string and sends the string across the network\n

Create a Cosmos DB client

In [52]:
%pip install azure-cosmos

from azure.core.exceptions import AzureError
from azure.cosmos import CosmosClient, PartitionKey
from azure.identity import DefaultAzureCredential

ENDPOINT = "https://knowledgebase.documents.azure.com:443/"

# Uncomment the following lines if you want to use Azure AD authentication
#credential = DefaultAzureCredential()
#dbclient = CosmosClient(ENDPOINT, credential)

# Connect to the Cosmos DB account using a connection string taken from the
# environment (format: "AccountEndpoint=...;AccountKey=...;").
# SECURITY: an account key used to be embedded in this cell in plain text.
# Treat that key as compromised and rotate it in the Azure portal.
dbclient = CosmosClient.from_connection_string(os.environ["COSMOS_CONNECTION_STRING"])

# Create database (if not exists)
database = dbclient.create_database_if_not_exists(id="knowledgebase")

# Must equal the length of the vectors stored under /embedding.
# text-embedding-ada-002 (the default model of create_embeddings) returns
# 1536-dimensional vectors.
EMBEDDING_DIMENSIONS = 1536

# Create container (if not exists)
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": "/embedding",
            "dataType": "float32",
            "distanceFunction": "cosine",
            "dimensions": EMBEDDING_DIMENSIONS
        }
    ]
}
indexing_policy = {
    "includedPaths": [
        {
            "path": "/*"
        }
    ],
    "excludedPaths": [
        {
            "path": "/\"_etag\"/?"
        },
        {
            # Keep the raw vector out of the regular index; it is covered by
            # the dedicated vector index below.
            "path": "/embedding/*"
        }
    ],
    "vectorIndexes": [
        {
            "path": "/embedding",
            "type": "quantizedFlat"
        },
    ]
}

# The indexing and vector policies are container settings, so they are passed
# to create_container_if_not_exists itself; PartitionKey only describes the
# partition key path.
# NOTE(review): an already existing "VectorStore" container is returned as-is,
# so one created without these policies may need to be recreated - confirm.
container = database.create_container_if_not_exists(
    id="VectorStore",
    partition_key=PartitionKey(path="/file"),  # Adjust based on your partition strategy
    indexing_policy=indexing_policy,
    vector_embedding_policy=vector_embedding_policy,
)

Defaulting to user installation because normal site-packages is not writeable
Collecting azure-cosmos
  Downloading azure_cosmos-4.9.0-py3-none-any.whl.metadata (80 kB)
Downloading azure_cosmos-4.9.0-py3-none-any.whl (303 kB)
Installing collected packages: azure-cosmos
Successfully installed azure-cosmos-4.9.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Chunk text and create embeddings


In [None]:
from openai import AzureOpenAI

# Initialize Azure OpenAI client.
# Key and endpoint are read from the environment so real credentials are not
# saved with the notebook; the placeholder literals apply only when the
# variables are unset.
embeddings_client = AzureOpenAI(
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "key"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "endpoint"),
    api_version="2024-10-21",
)

def split_text(text, max_length=1000, min_length=500):
    """Split ``text`` into whitespace-normalised chunks without losing any words.

    Words are accumulated (joined by single spaces) until the chunk reaches at
    least ``max_length`` characters, at which point the chunk is closed. A
    chunk therefore ends on a word boundary and may exceed ``max_length`` by
    part of its final word.

    The trailing remainder is never discarded:
      * if it has at least ``min_length`` characters it becomes its own chunk;
      * otherwise it is appended to the previous chunk;
      * if there is no previous chunk (short input) it is returned as the
        only chunk.

    Args:
        text: Input text; any run of whitespace counts as one separator.
        max_length: Character length at which a chunk is closed.
        min_length: Minimum character length for a stand-alone final chunk.

    Returns:
        List of chunk strings; empty only when ``text`` contains no words.
    """
    chunks = []
    current_words = []
    current_length = 0  # == len(' '.join(current_words)), maintained incrementally

    for word in text.split():
        # +1 accounts for the joining space before every word but the first.
        current_length += len(word) + (1 if current_words else 0)
        current_words.append(word)
        if current_length >= max_length:
            chunks.append(' '.join(current_words))
            current_words = []
            current_length = 0

    if current_words:
        tail = ' '.join(current_words)
        if len(tail) >= min_length or not chunks:
            chunks.append(tail)
        else:
            # Too short to stand alone: fold it into the previous chunk rather
            # than dropping the end of the document.
            chunks[-1] = f"{chunks[-1]} {tail}"

    return chunks

def create_embeddings(text_chunk, model="text-embedding-ada-002"):
    """Return the embedding of ``text_chunk`` computed by ``model``.

    The request goes through the module-level ``embeddings_client``; the
    ``embedding`` attribute of the first item in the response data is returned.
    """
    request = {"input": text_chunk, "model": model}
    result = embeddings_client.embeddings.create(**request)
    first_item = result.data[0]
    return first_item.embedding

# Read the detailed documentation file.
# Set the DOCUMENTATION_PATH environment variable to use another file without
# editing the notebook; the literal below is only the default.
documentation_path = os.environ.get(
    "DOCUMENTATION_PATH",
    r"C:\Users\rajrishi\OneDrive - Microsoft\Desktop\detailed_documentation.txt",
)
with open(documentation_path, "r", encoding="utf-8") as file:
    input_text = file.read()

# Split the text into manageable chunks
chunks = split_text(input_text, max_length=1000, min_length=500)
print(f"✅ Number of chunks: {len(chunks)}")
# Guard the preview: indexing an empty list would raise IndexError.
if chunks:
    print(f"📝 First chunk: {chunks[0]}")

# Generate embeddings for each chunk, kept as (embedding, chunk_text) pairs
embeddings = [(create_embeddings(chunk), chunk) for chunk in chunks]
if embeddings:
    print("🔹 First embedding generated successfully!")

✅ Number of chunks: 31
📝 First chunk: # **Detailed Documentation for the Code Structure** This documentation provides an in-depth explanation of the code structure and functionality for the `Streamer` class, its methods, and associated components. The `Streamer` class is part of a streaming application designed to capture video frames from a camera, convert them into a string format, and transmit them over a network to a viewer. --- ## **File Information** - **File Path:** `C:\Users\rajrishi\OneDrive - Microsoft\Desktop\SmoothStream-master\Streamer.py` - **Purpose:** Implements the `Streamer` class to handle video streaming functionality. --- ## **Class: Streamer** ### **Overview** The `Streamer` class is responsible for: 1. Establishing a network connection to a viewer application (`StreamViewer`). 2. Capturing video frames from a camera. 3. Sending the video frames over the network in a serialized format. --- ### **Constructor: `__init__`** #### **Definition** ```python def __init__(

In [56]:
import hashlib

# Push the embeddings to Cosmos DB
for embedding, chunk in embeddings:
    # Built-in hash() of a str is salted per interpreter process, so it yields
    # a different id for the same text on every run. A SHA-256 digest of the
    # chunk is stable, which lets a re-run address the same documents.
    document_id = hashlib.sha256(chunk.encode("utf-8")).hexdigest()
    try:
        # Upsert so that re-running the cell replaces an existing document
        # instead of failing with an id conflict.
        container.upsert_item(
            body={
                'id': document_id,  # Stable, content-derived ID for the document
                'embedding': embedding,
                'text': chunk,
                'file': "file1"  # Example partition key value
            },
        )
    except AzureError as e:
        print(f"Error creating item: {e}")

In [None]:
import json
from azure.core.exceptions import AzureError
from openai import AzureOpenAI

# OpenAI Configuration
deployment_name = "gpt-4o"  # Ensure this matches your deployment
api_version = "2024-10-21"

# Initialize OpenAI Client.
# This rebinds `embeddings_client`, the client that create_embeddings() calls
# and that chatbot() uses for chat completions. Key and endpoint are read from
# the environment so real credentials are not saved with the notebook; the
# placeholder literals apply only when the variables are unset.
embeddings_client = AzureOpenAI(
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "key"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "azep"),
    api_version=api_version,
)

def get_nearest_vectors(collection, query_vector, k=5):
    """Return the ``text`` of the ``k`` stored items most similar to ``query_vector``.

    The query ranks items with ``ORDER BY VectorDistance(...)`` before ``TOP``
    is applied; without the ordering, ``TOP @k`` simply returns the first
    ``k`` rows the service happens to produce.

    Args:
        collection: Container object exposing
            ``query_items(query=..., parameters=..., enable_cross_partition_query=...)``.
        query_vector: Embedding of the user query.
        k: Maximum number of snippets to return.

    Returns:
        List of ``text`` values, in the order returned by the query.
    """
    query = '''
        SELECT TOP @k c.text
        FROM c
        ORDER BY VectorDistance(c.embedding, @embedding)
    '''
    parameters = [
        {"name": "@embedding", "value": query_vector},
        {"name": "@k", "value": k}
    ]

    results = collection.query_items(
        query=query,
        parameters=parameters,
        enable_cross_partition_query=True,
    )

    return [result["text"] for result in results]

def chatbot():
    """Run an interactive question/answer loop over the knowledge base.

    For each question the loop embeds the text with ``create_embeddings``,
    fetches up to five snippets with ``get_nearest_vectors`` from the
    module-level ``container``, and streams a chat completion (via
    ``embeddings_client`` and ``deployment_name``) that receives those
    snippets as context. Typing ``exit`` (any letter case) ends the loop;
    blank input is ignored.

    Returns:
        None. All interaction happens through ``input()`` and ``print()``.
    """
    while True:
        user_input = input("\nAsk your query (or type 'exit' to quit): ")
        if user_input.lower() == "exit":
            print("Goodbye!")
            break
        if not user_input.strip():
            # Nothing to embed or search for; ask again.
            continue

        query_vector = create_embeddings(user_input)

        # Retrieve relevant documents
        relevant_docs = get_nearest_vectors(container, query_vector, k=5)
        if not relevant_docs:
            print("I couldn't find relevant information in the knowledge base.")
            continue

        # Prepare context
        context = "\n".join(relevant_docs)
        messages = [
            {"role": "system", "content": "You are an AI assistant helping with AI-related queries."},
            {"role": "system", "content": f"Here are some relevant snippets from the knowledge base:\n{context}"},
            {"role": "user", "content": user_input}
        ]

        # Stream response from OpenAI
        response_stream = embeddings_client.chat.completions.create(
            model=deployment_name,
            temperature=0.7,
            max_tokens=800,
            messages=messages,
            stream=True  # Enable streaming
        )

        print("\nResponse:")
        for chunk in response_stream:
            # Some stream events carry no choices; skip them.
            if not chunk.choices:
                continue
            # The delta is an object, not a dict: read `.content` as an
            # attribute. It can be None (or absent) on events without text.
            token = getattr(chunk.choices[0].delta, "content", None)
            if token:
                print(token, end="", flush=True)  # Print each token as it arrives
        print()  # Ensure the final output ends with a newline

# Start the chatbot loop. This call blocks, prompting via input() for each
# query, and only returns once the user types 'exit'.
chatbot()



