In [3]:
import os
import re
import requests
import base64
from typing import Dict, List, Optional, Any
from google import genai
from google.genai import types
import json
import logging

# Shared, module-level handles used by the helpers in this notebook.
# They are placeholders until initialize_clients() rebinds them.
gemini_client = github_session = None  # Gemini client / GitHub HTTP session
github_headers = dict()  # headers sent with GitHub API requests

def initialize_clients(gemini_api_key: str, github_token: Optional[str] = None):
    """Create the shared Gemini client and GitHub HTTP session.

    Rebinds the module-level ``gemini_client``, ``github_session`` and
    ``github_headers`` names; nothing is returned.

    Args:
        gemini_api_key: API key handed to ``genai.Client``.
        github_token: Optional GitHub token. When truthy it is added to the
            headers as ``Authorization: token <value>``.
    """
    global gemini_client, github_session, github_headers

    # Assemble the GitHub headers first; the token is only added when given.
    headers = {
        'Accept': 'application/vnd.github.v3+json',
        'User-Agent': 'GitHub-Architecture-Analyzer'
    }
    if github_token:
        headers['Authorization'] = f'token {github_token}'

    gemini_client = genai.Client(api_key=gemini_api_key)
    github_session = requests.Session()
    github_headers = headers

# SECURITY: a Gemini API key used to be pasted on this line in plain text.
# A key that has been saved in a notebook should be treated as leaked and
# rotated. Credentials are read from the environment instead.
_gemini_api_key = os.environ.get("GEMINI_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY environment variable before running this notebook."
    )

# GITHUB_TOKEN is optional; an unset or empty value behaves like the original
# call, which passed no GitHub token at all.
initialize_clients(
    _gemini_api_key,
    github_token=os.environ.get("GITHUB_TOKEN") or None,
)

In [4]:
def validate_repository(github_url: str) -> Optional[Dict[str, Any]]:
    """Validate a GitHub URL and return basic metadata about the repository.

    The owner and repository name are parsed from the end of ``github_url``
    (an optional ``.git`` suffix and trailing slash are tolerated), then the
    repository is looked up through the GitHub REST API using the
    module-level ``github_session`` and ``github_headers``.

    Args:
        github_url: URL of the form ``https://github.com/<owner>/<repo>``.

    Returns:
        A dict with the keys ``owner``, ``name``, ``full_name``,
        ``description``, ``language``, ``stars``, ``forks`` and ``url``, or
        ``None`` when the URL does not match, the API does not answer with
        HTTP 200, or any exception is raised (the failure is logged).
    """
    try:
        # Extract owner and repo name from URL
        pattern = r'github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$'
        match = re.search(pattern, github_url)

        if not match:
            return None

        owner, repo_name = match.groups()

        # Fetch repository info via GitHub API. The timeout (seconds) keeps a
        # stalled connection from hanging the notebook; a timeout error is
        # handled by the except clause below like any other failure.
        api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
        response = github_session.get(api_url, headers=github_headers, timeout=30)

        if response.status_code != 200:
            logging.error(f"GitHub API error: {response.status_code}")
            return None

        repo_data = response.json()

        # GitHub sends JSON null for unset fields (e.g. no description, no
        # detected language). dict.get(key, default) would pass that None
        # through, so fall back with `or` instead.
        return {
            'owner': owner,
            'name': repo_name,
            'full_name': repo_data.get('full_name') or f"{owner}/{repo_name}",
            'description': repo_data.get('description') or '',
            'language': repo_data.get('language') or 'Unknown',
            'stars': repo_data.get('stargazers_count') or 0,
            'forks': repo_data.get('forks_count') or 0,
            'url': github_url
        }

    except Exception as e:
        logging.error(f"Error validating repository: {e}")
        return None


In [5]:
# Smoke test: look up the demo repository and show the metadata dict
# (None is printed when validation fails).
githubval = validate_repository(
    "https://github.com/tushararora-dev/Architecture_Documentation_Generator"
)
print(githubval)

{'owner': 'tushararora-dev', 'name': 'Architecture_Documentation_Generator', 'full_name': 'tushararora-dev/Architecture_Documentation_Generator', 'description': "This is a GitHub Architecture Documentation Generator built with Streamlit that analyzes GitHub repositories and generates comprehensive architecture documentation using AI. The application leverages Google's Gemini AI API to analyze repository code and structure, then automatically generates professional documentation in PDF and DOCX formats.", 'language': 'Python', 'stars': 0, 'forks': 0, 'url': 'https://github.com/tushararora-dev/Architecture_Documentation_Generator'}


In [8]:
# File extensions (lowercase, with leading dot) that count as "code" when
# tallying statistics and choosing which files to download.
CODE_EXTENSIONS = {
    # general-purpose languages
    '.py', '.java', '.cpp', '.c', '.h', '.cs', '.php', '.rb',
    '.go', '.rs', '.swift', '.kt', '.scala',
    # JavaScript / TypeScript and front-end frameworks
    '.js', '.ts', '.jsx', '.tsx', '.vue', '.svelte',
    # markup and stylesheets
    '.html', '.css', '.scss', '.sass', '.less',
    # data, configuration and documentation
    '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg',
    '.md', '.txt', '.properties',
    # build tooling
    '.dockerfile', '.makefile', '.gradle',
}

# Lowercase file names that are always worth downloading.
PRIORITY_FILES = {
    # dependency and build manifests
    'package.json', 'requirements.txt', 'cargo.toml', 'pom.xml',
    'build.gradle', 'gemfile', 'composer.json', 'setup.py',
    'dockerfile', 'makefile',
    # project documentation
    'readme.md', 'architecture.md', 'design.md', 'contributing.md',
    'changelog.md', 'license',
}

# Path patterns that mark content as irrelevant for architecture analysis.
SKIP_PATTERNS = {
    # tests
    'test', 'tests', '__tests__', 'spec', 'specs',
    # caches, tooling state and coverage data
    '__pycache__', '.pytest_cache', '.tox', 'coverage', '.coverage',
    # dependencies and virtual environments
    'node_modules', 'vendor', 'venv', 'env', '.env',
    # version control, editors and OS metadata
    '.git', '.github', '.vscode', '.idea', '.DS_Store',
    # build output and scratch directories
    'dist', 'build', 'target', 'logs', 'tmp', 'temp',
    # static asset folders
    'public/assets', 'static/assets', 'assets/images',
}


def fetch_repository_contents(owner: str, repo_name: str, path: str = "") -> List[Dict]:
    """Fetch the GitHub "contents" listing for one path of a repository.

    Uses the module-level ``github_session`` and ``github_headers``.

    Args:
        owner: Repository owner (user or organisation).
        repo_name: Repository name.
        path: Path inside the repository; the empty string is the root.

    Returns:
        A list of content entries as returned by the API. A response that is
        a single JSON object rather than a list is wrapped in a one-element
        list. An empty list is returned (and a warning logged) on a non-200
        status or on any exception.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote

    try:
        # Percent-encode the path but keep "/" as the separator. Without this
        # a "#" or "?" in a file or directory name ends the URL path early
        # and the request silently targets a different location.
        encoded_path = quote(path, safe='/')
        api_url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/{encoded_path}"

        # The timeout (seconds) stops a stalled connection from blocking the
        # crawl; a timeout error is handled below like any other failure.
        response = github_session.get(api_url, headers=github_headers, timeout=30)

        if response.status_code != 200:
            logging.warning(f"Could not fetch contents for path {path}: {response.status_code}")
            return []

        contents = response.json()
        if not isinstance(contents, list):
            contents = [contents]

        return contents

    except Exception as e:
        logging.warning(f"Error fetching contents for path {path}: {e}")
        return []
    
def should_skip_path(path: str) -> bool:
    """Decide whether a repository path should be left out of the analysis.

    A path is skipped when it ends with one of the binary/media suffixes
    below, or when it matches an entry of ``SKIP_PATTERNS``. Matching is
    case-insensitive and works on whole path segments:

    * a single-segment pattern such as ``tests`` matches a path that has a
      segment equal to it (``src/tests/a.py``), not one that merely contains
      the text (``latest.py``, ``templates/``);
    * a dot-prefixed pattern such as ``.env`` also matches dotted variants
      of that name (``.env.local``);
    * a multi-segment pattern such as ``public/assets`` matches when its
      segments appear consecutively in the path.

    Args:
        path: Repository-relative path using ``/`` separators.

    Returns:
        True if the path should be skipped, False otherwise.
    """
    path_lower = path.lower()

    # Skip common non-code files
    if path_lower.endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico',
                           '.pdf', '.zip', '.tar', '.gz', '.exe', '.dmg',
                           '.app', '.deb', '.rpm', '.msi')):
        return True

    segments = [segment for segment in path_lower.split('/') if segment]

    for skip_pattern in SKIP_PATTERNS:
        # Lowercase the pattern too: the path is lowercased, so a mixed-case
        # pattern such as '.DS_Store' would otherwise never match.
        wanted = [segment for segment in skip_pattern.lower().split('/') if segment]
        if not wanted:
            continue

        if len(wanted) == 1:
            name = wanted[0]
            is_dotfile = name.startswith('.')
            for segment in segments:
                if segment == name:
                    return True
                if is_dotfile and segment.startswith(name + '.'):
                    return True
        else:
            # Slide a window over the path looking for the consecutive run.
            width = len(wanted)
            for start in range(len(segments) - width + 1):
                if segments[start:start + width] == wanted:
                    return True

    return False

def get_file_type(filename: str) -> str:
    """Return a human-readable file type for ``filename``.

    The lowercase extension is looked up first. Files without a recognised
    extension are then matched by their lowercase base name, so that the
    conventional extensionless ``Dockerfile`` and ``Makefile`` are identified
    instead of falling through to ``'Other'``.

    Args:
        filename: File name or path.

    Returns:
        The type label, or ``'Other'`` when neither the extension nor the
        base name is known.
    """
    type_mapping = {
        '.py': 'Python',
        '.js': 'JavaScript',
        '.ts': 'TypeScript',
        '.jsx': 'React JSX',
        '.tsx': 'React TSX',
        '.java': 'Java',
        '.cpp': 'C++',
        '.c': 'C',
        '.h': 'Header',
        '.cs': 'C#',
        '.php': 'PHP',
        '.rb': 'Ruby',
        '.go': 'Go',
        '.rs': 'Rust',
        '.swift': 'Swift',
        '.kt': 'Kotlin',
        '.html': 'HTML',
        '.css': 'CSS',
        '.scss': 'SCSS',
        '.json': 'JSON',
        '.md': 'Markdown',
        '.yaml': 'YAML',
        '.yml': 'YAML',
        '.dockerfile': 'Docker',
        '.makefile': 'Makefile'
    }

    # Well-known files that carry no extension at all.
    name_mapping = {
        'dockerfile': 'Docker',
        'makefile': 'Makefile'
    }

    base_name = os.path.basename(filename).lower()
    ext = os.path.splitext(base_name)[1]

    if ext in type_mapping:
        return type_mapping[ext]

    return name_mapping.get(base_name, 'Other')

def fetch_all_repository_files(github_url: str) -> Dict[str, Any]:
    """Walk a GitHub repository breadth-first and collect its structure.

    Directory listings come from ``fetch_repository_contents``; paths for
    which ``should_skip_path`` is true are counted as skipped and not
    descended into. Selected files are downloaded through the module-level
    ``github_session``.

    Args:
        github_url: URL of the form ``https://github.com/<owner>/<repo>``.

    Returns:
        A dict with:
          * ``files``: one ``{'path', 'name', 'size', 'type'}`` dict per file,
          * ``directories``: list of directory paths,
          * ``key_files``: path -> ``{'content', 'size', 'type'}`` for every
            file whose content was downloaded (``type`` is the extension),
          * ``statistics``: counters ``total_files``, ``code_files``,
            ``analyzed_files``, ``skipped_files`` and a per-extension
            ``languages`` tally.

    Raises:
        ValueError: If ``github_url`` does not look like a repository URL.
        Exception: Any other error raised outside the per-file download is
            logged and re-raised. Download failures for individual files are
            only logged as warnings.
    """
    # Local import keeps this cell self-contained.
    from collections import deque

    try:
        # Extract owner and repo name
        pattern = r'github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$'
        match = re.search(pattern, github_url)
        if not match:
            raise ValueError("Invalid GitHub URL")

        owner, repo_name = match.groups()

        # Initialize structure
        structure = {
            'files': [],
            'directories': [],
            'key_files': {},
            'statistics': {
                'total_files': 0,
                'code_files': 0,
                'analyzed_files': 0,
                'skipped_files': 0,
                'languages': {}
            }
        }
        stats = structure['statistics']

        # Lowercased once here instead of once per file inside the loop.
        priority_names = {name.lower() for name in PRIORITY_FILES}

        # Queue for breadth-first traversal; deque gives O(1) pops from the
        # left where list.pop(0) shifts every remaining element.
        paths_to_process = deque([""])
        processed_paths = set()

        while paths_to_process:
            current_path = paths_to_process.popleft()

            if current_path in processed_paths:
                continue
            processed_paths.add(current_path)

            # Skip if path matches skip patterns
            if current_path and should_skip_path(current_path):
                stats['skipped_files'] += 1
                continue

            contents = fetch_repository_contents(owner, repo_name, current_path)

            for content in contents:
                content_path = content['path']

                # Skip if matches skip patterns
                if should_skip_path(content_path):
                    stats['skipped_files'] += 1
                    continue

                if content['type'] == "dir":
                    structure['directories'].append(content_path)
                    # Add directory to processing queue
                    paths_to_process.append(content_path)

                elif content['type'] == "file":
                    structure['files'].append({
                        'path': content_path,
                        'name': content['name'],
                        'size': content['size'],
                        'type': get_file_type(content['name'])
                    })
                    stats['total_files'] += 1

                    # Count file by extension
                    ext = os.path.splitext(content['name'])[1].lower()
                    if ext in CODE_EXTENSIONS:
                        stats['code_files'] += 1
                        stats['languages'][ext] = stats['languages'].get(ext, 0) + 1

                    # Fetch content for relevant files.
                    # NOTE(review): the size test is OR-ed with the other two,
                    # so any file under 100KB is downloaded whatever its
                    # extension, and code files of any size are downloaded
                    # too. Confirm whether the size was meant as an upper
                    # bound instead.
                    should_analyze = (
                        content['name'].lower() in priority_names or
                        ext in CODE_EXTENSIONS or
                        content['size'] < 100000  # Files under 100KB
                    )

                    if should_analyze:
                        download_url = content.get('download_url')
                        if not download_url:
                            # Nothing to download for this entry; record the
                            # file but leave it out of key_files.
                            logging.warning(f"No download URL for file {content_path}")
                        else:
                            try:
                                # The timeout (seconds) keeps one stalled
                                # download from hanging the whole crawl.
                                file_response = github_session.get(download_url, timeout=30)
                                if file_response.status_code == 200:
                                    structure['key_files'][content_path] = {
                                        'content': file_response.text,
                                        'size': content['size'],
                                        'type': ext
                                    }
                                    stats['analyzed_files'] += 1
                            except Exception as e:
                                logging.warning(f"Could not fetch file {content_path}: {e}")

        return structure

    except Exception as e:
        logging.error(f"Error fetching repository structure: {e}")
        raise


In [9]:
# Crawl the demo repository and print the raw result structure
# (this includes the downloaded file contents under 'key_files').
fetch = fetch_all_repository_files(
    "https://github.com/tushararora-dev/Architecture_Documentation_Generator"
)
print(fetch)



In [10]:
# Pretty-print the structure bound to `fetch` by the previous cell so the
# nested dicts and lists are easier to read than the plain print() output.
from pprint import pprint
pprint(fetch)

{'directories': ['.streamlit'],
 'files': [{'name': 'app.py',
            'path': 'app.py',
            'size': 10368,
            'type': 'Python'},
           {'name': 'document_generator_functions.py',
            'path': 'document_generator_functions.py',
            'size': 10763,
            'type': 'Python'},
           {'name': 'github_analyzer_functions.py',
            'path': 'github_analyzer_functions.py',
            'size': 16380,
            'type': 'Python'},
           {'name': 'readme.md',
            'path': 'readme.md',
            'size': 3895,
            'type': 'Markdown'},
           {'name': 'requirements.txt',
            'path': 'requirements.txt',
            'size': 124,
            'type': 'Other'},
           {'name': 'utils.py',
            'path': 'utils.py',
            'size': 5622,
            'type': 'Python'},
           {'name': 'config.toml',
            'path': '.streamlit/config.toml',
            'size': 288,
            'type': 'Other'}],
 '