# Run Streamlit App in SageMaker Studio with Public URL

This notebook starts your Streamlit app and creates a public URL using ngrok.

---

## 📦 Required Files

Make sure you have uploaded:
- `app.py` (your Streamlit app)
- `src/` directory (all modules)
- `requirements.txt`
- This notebook

---

## 🔑 Before You Start

Get your free ngrok token:
1. Visit: https://dashboard.ngrok.com/get-started/your-authtoken
2. Sign up (free)
3. Copy your token
4. Paste it in **Cell 2** below


## 1. Install Dependencies (including py-spy for profiling)


In [None]:
!pip install -q streamlit pyngrok psycopg2-binary pandas pydantic pydantic-settings langchain-ollama python-dotenv py-spy

print("✅ Dependencies installed (including py-spy for flamegraph profiling)")


## 2. Configure Ngrok Token

**⚠️ REPLACE `YOUR_NGROK_TOKEN_HERE` with your token from https://dashboard.ngrok.com**


In [None]:
import os

# Sentinel used to detect that no token was pasted below. Do not edit this line.
NGROK_TOKEN_PLACEHOLDER = "YOUR_NGROK_TOKEN_HERE"

# REPLACE THIS WITH YOUR ACTUAL NGROK TOKEN
# (or leave it untouched and export NGROK_AUTHTOKEN so the secret is never
# saved inside the notebook file)
NGROK_TOKEN = "YOUR_NGROK_TOKEN_HERE"

# Tolerate stray whitespace from copy/paste.
NGROK_TOKEN = NGROK_TOKEN.strip()

# Nothing pasted: fall back to the environment variable, keeping the
# placeholder when that is unset or empty so the check below still fires.
if NGROK_TOKEN in ("", NGROK_TOKEN_PLACEHOLDER):
    NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN", "").strip() or NGROK_TOKEN_PLACEHOLDER

if NGROK_TOKEN == NGROK_TOKEN_PLACEHOLDER:
    print("❌ ERROR: Set your ngrok token above!")
    print("   Get it from: https://dashboard.ngrok.com/get-started/your-authtoken")
else:
    print("✅ Ngrok token configured")


## 3. Install and Start Ollama


In [None]:
import os
import shutil
import subprocess
import time

# Model tag that must be present locally before the app can run.
model_name = 'qwen2.5:7b-instruct-q4_0'

# Install Ollama if needed. shutil.which() also covers hosts where the `which`
# binary itself is missing.
if shutil.which('ollama'):
    print("✅ Ollama already installed")
else:
    print("📦 Installing Ollama...")
    subprocess.run('curl -fsSL https://ollama.com/install.sh | sh', shell=True)
    # The pipeline's exit status is that of `sh`, which can be 0 even when curl
    # failed, so verify the outcome by looking for the binary again.
    if shutil.which('ollama') is None:
        raise RuntimeError("Ollama installation failed: 'ollama' is still not on PATH")
    print("✅ Ollama installed")

# Stop existing processes
subprocess.run(['pkill', 'ollama'], capture_output=True)
time.sleep(2)

# Start Ollama with optimized settings
print("🚀 Starting Ollama server...")
ollama_env = os.environ.copy()
ollama_env['OLLAMA_NUM_PARALLEL'] = '10'
ollama_env['OLLAMA_MAX_LOADED_MODELS'] = '1'

# The child inherits the log file descriptor, so closing our handle when the
# `with` block exits does not interrupt its logging.
with open('/tmp/ollama.log', 'w') as log:
    ollama_process = subprocess.Popen(['ollama', 'serve'],
                                      env=ollama_env,
                                      stdout=log,
                                      stderr=log)

# Poll until the server answers instead of sleeping for a fixed time:
# `ollama list` needs a reachable server to succeed.
for _ in range(30):
    if ollama_process.poll() is not None:
        raise RuntimeError(
            f"Ollama server exited with code {ollama_process.returncode}; see /tmp/ollama.log"
        )
    result = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
    if result.returncode == 0:
        break
    time.sleep(1)
else:
    raise RuntimeError("Ollama server did not become ready within 30 seconds; see /tmp/ollama.log")

print(f"✅ Ollama started (PID: {ollama_process.pid})")

# Download model if needed
print("📥 Checking for model...")
result = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
if model_name not in result.stdout:
    print("📥 Downloading Qwen 2.5 7B (5-10 min)...")
    pull_result = subprocess.run(['ollama', 'pull', model_name])
    # Only claim success when the pull actually succeeded.
    if pull_result.returncode != 0:
        raise RuntimeError(f"'ollama pull {model_name}' failed with exit code {pull_result.returncode}")
    print("✅ Model downloaded")
else:
    print("✅ Model ready")

print("\n✅ Ollama ready!")


## 4. Setup SQLite Database


In [None]:
import sqlite3
from contextlib import closing

# Single source of truth for the database location; the .env entry below is
# derived from it so the app and this notebook cannot point at different files.
db_path = '/tmp/contributor_intelligence.db'

# Create .env file for app with SQLite configuration.
# NOTE: this overwrites any existing .env in the working directory.
with open('.env', 'w') as f:
    f.write(
        f"DATABASE_URL=sqlite:///{db_path}\n"
        "\n"
        "OLLAMA_MODEL=qwen2.5:7b-instruct-q4_0\n"
        "OLLAMA_BASE_URL=http://localhost:11434\n"
    )

print("✅ Environment configured (SQLite mode)")

# Create SQLite database.
# closing() guarantees the connection is closed even if a statement raises;
# the inner `with conn` commits on success and rolls back on error.
with closing(sqlite3.connect(db_path)) as conn:
    with conn:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS contributors (
                email TEXT PRIMARY KEY,
                contributor_id TEXT UNIQUE NOT NULL,
                processed_data TEXT NOT NULL,
                intelligence_summary TEXT,
                processing_status TEXT DEFAULT 'pending',
                error_message TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                intelligence_extracted_at TIMESTAMP
            )
        ''')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_status ON contributors(processing_status)')

print(f"✅ SQLite database: {db_path}")


## 5. Configure Ngrok


In [None]:
from pyngrok import ngrok, conf

# Store the auth token on pyngrok's default configuration object so later
# ngrok calls in this notebook use it.
pyngrok_config = conf.get_default()
pyngrok_config.auth_token = NGROK_TOKEN
print("✅ Ngrok configured")


## 6. Start Streamlit App


In [None]:
import subprocess
import time

# Stop existing Streamlit
subprocess.run(['pkill', '-f', 'streamlit'], capture_output=True)
time.sleep(2)

# Start Streamlit
print("🎨 Starting Streamlit app...")

# Start from an empty log, then hand the child an append-mode descriptor:
# append-mode writes always land at the current end of file, so the log stays
# well-formed even if another cell truncates it while Streamlit is running.
open('/tmp/streamlit.log', 'w').close()
with open('/tmp/streamlit.log', 'a') as log:
    streamlit_process = subprocess.Popen(
        ['streamlit', 'run', 'app.py',
         '--server.port', '8501',
         '--server.headless', 'true'],
        stdout=log,
        stderr=log
    )

# Give the server time to boot, then confirm it is still alive instead of
# reporting success unconditionally.
time.sleep(8)
if streamlit_process.poll() is None:
    print(f"✅ Streamlit started (PID: {streamlit_process.pid})")
else:
    print(f"❌ Streamlit exited early (exit code {streamlit_process.returncode}). Last log lines:")
    with open('/tmp/streamlit.log', 'r', errors='replace') as log:
        print(''.join(log.readlines()[-20:]))


## 7. Create Public URL 🌐

**🎉 The ngrok cell further below (after the optional log monitor) creates your public URL! Keep this notebook running while you use it!**


## 7.5 Monitor Streamlit Logs (Optional)

Run this cell to see live logs from the Streamlit app.
It loops until you press the STOP button (■), so keep it running only while you want to watch the logs, and stop it before running any other cell.


In [None]:
# Monitor Streamlit logs in real-time
# Press STOP button to exit

import os
import time
from IPython.display import clear_output

print("📊 Monitoring Streamlit logs...")
print("=" * 80)
print("Press the STOP button (■) to exit\n")

try:
    # Make sure the log exists WITHOUT truncating it. Truncating a file that
    # the running Streamlit process still holds open makes its next write land
    # at the old offset, leaving a block of NUL bytes at the start of the file.
    with open('/tmp/streamlit.log', 'a'):
        pass

    with open('/tmp/streamlit.log', 'r', errors='replace') as log_file:
        # "Start fresh" by skipping what is already there, then follow the file
        # incrementally rather than re-reading all of it on every poll.
        log_file.seek(0, os.SEEK_END)

        partial_line = ''
        while True:
            chunk = log_file.readline()
            if not chunk:
                time.sleep(0.5)  # Nothing new yet; poll again in 0.5 seconds
                continue

            # The writer may not have finished the line yet; buffer until the
            # newline arrives so a line is never printed in pieces.
            partial_line += chunk
            if partial_line.endswith('\n'):
                print(partial_line.rstrip())
                partial_line = ''

except KeyboardInterrupt:
    print("\n\n✅ Stopped monitoring logs")


In [None]:
# Best-effort cleanup of any tunnel left over from a previous run. A failure
# here is reported rather than silently swallowed, but does not stop the cell.
try:
    ngrok.kill()
except Exception as exc:
    print(f"⚠️  Could not stop a previous ngrok process: {exc}")

# Open an HTTPS tunnel to the local Streamlit port.
public_url = ngrok.connect(8501, bind_tls=True)

# Current pyngrok returns an NgrokTunnel object whose str() is a description,
# not the URL; read its `public_url` attribute and fall back to str() for
# older releases that returned the URL string directly.
PUBLIC_URL = getattr(public_url, "public_url", None) or str(public_url)

print("="*80)
print("✅ YOUR STREAMLIT APP IS NOW PUBLIC!")
print("="*80)
print(f"\n🌐 Public URL: {PUBLIC_URL}")
print(f"\n{'='*80}")
print("\n📊 Open this URL in your browser!")
print("\n⚠️  Keep this notebook running!")
print(f"\n{'='*80}\n")


---

# 🔥 PERFORMANCE PROFILING

## 8. Profile Intelligence Extraction with py-spy

Creates a flamegraph in speedscope format!

**Run this AFTER uploading CSV data in the Streamlit app**


In [None]:
# Create profiling script.
#
# The script is kept in a RAW string so that escapes such as \n reach the
# generated file unchanged and are interpreted there, not here. Joins and
# conditional text are computed outside the f-strings: backslashes inside an
# f-string expression (and, before Python 3.12, re-using the enclosing quote
# character) make the generated file a SyntaxError.
profiling_script = r'''
import sys, os
sys.path.insert(0, os.getcwd())

import asyncio
from src.config import get_settings
from src.database.connection import DatabaseManager
from src.database.repositories import ContributorRepository
from src.intelligence.llm_client import OllamaClient
from src.intelligence.skill_extractor import (
    has_project_descriptions,
    generate_summary_no_projects,
    generate_summary_no_descriptions,
    parse_llm_response
)
from src.models import ContributorProfile


def build_prompt(profile):
    """Build the LLM prompt (simplified version) for a profile with project descriptions."""
    languages_text = (
        ", ".join([lang.language for lang in profile.languages[:3]])
        if profile.languages else "Not specified"
    )
    education_text = profile.education_level if profile.education_level else "Not specified"
    project_lines = "\n".join([
        f"{i}. [{proj.project_type}] {proj.long_desc[:400] if proj.long_desc else 'No description'}"
        for i, proj in enumerate(profile.production_projects[:10], 1)
    ])
    return f"""You MUST output in this EXACT format:

SUMMARY:
[Your 90-120 word paragraph here]

SKILLS:
- Skill 1
- Skill 2

===== CONTRIBUTOR DATA =====
Location: {str(profile.location)}
Languages: {languages_text}
Education: {education_text}
Production Projects: {profile.activity_summary.total_production_projects}

Project Descriptions:
{project_lines}

CRITICAL: You MUST include both "SUMMARY:" and "SKILLS:" headers."""


async def profile_extraction():
    settings = get_settings()
    db_manager = DatabaseManager(settings)
    repo = ContributorRepository(db_manager)
    llm_client = OllamaClient(settings)

    pending = repo.get_contributors_without_intelligence()
    if not pending:
        print("No pending contributors")
        return

    pending = pending[:100]  # Profile first 100
    print(f"Profiling {len(pending)} contributors...")

    processed = 0
    failed = 0
    batch_size = settings.max_concurrent_llm

    # Process in batches
    for batch_start in range(0, len(pending), batch_size):
        batch_end = min(batch_start + batch_size, len(pending))
        batch = pending[batch_start:batch_end]

        # Convert to profiles
        profiles = []
        for contrib in batch:
            try:
                profile = ContributorProfile(**contrib["processed_data"])
                profiles.append(profile)
            except Exception as e:
                print(f"Failed to parse profile: {e}")
                failed += 1

        # Separate by description availability
        profiles_with_descriptions = []
        profiles_without_descriptions = []

        for profile in profiles:
            if not profile.production_projects:
                summary = generate_summary_no_projects(profile)
                profile.extracted_skills = []
                profiles_without_descriptions.append((profile, summary))
            elif not has_project_descriptions(profile):
                summary = generate_summary_no_descriptions(profile)
                profile.extracted_skills = []
                profiles_without_descriptions.append((profile, summary))
            else:
                profiles_with_descriptions.append(profile)

        # Process profiles WITH descriptions using LLM
        if profiles_with_descriptions:
            prompt_texts = [build_prompt(profile) for profile in profiles_with_descriptions]

            # Call LLM in batch
            try:
                llm_responses = await llm_client.generate_batch(prompt_texts, max_concurrent=batch_size)

                # Parse responses
                for profile, llm_response in zip(profiles_with_descriptions, llm_responses):
                    try:
                        if not llm_response.startswith("Error:"):
                            summary_text, skills_list = parse_llm_response(llm_response)
                            profile.extracted_skills = skills_list
                            if skills_list:
                                skills_text = ", ".join(skills_list)
                                profile.intelligence_summary = f"{summary_text}\n\nSkills: {skills_text}"
                            else:
                                profile.intelligence_summary = summary_text
                        else:
                            profile.intelligence_summary = llm_response
                            profile.extracted_skills = []

                        repo.upsert_contributor(profile)
                        processed += 1
                    except Exception as e:
                        print(f"Failed to process profile: {e}")
                        failed += 1
            except Exception as e:
                print(f"Batch LLM generation failed: {e}")
                failed += len(profiles_with_descriptions)

        # Update profiles WITHOUT descriptions
        for profile, summary in profiles_without_descriptions:
            try:
                profile.intelligence_summary = summary
                repo.upsert_contributor(profile)
                processed += 1
            except Exception as e:
                print(f"Failed to update profile: {e}")
                failed += 1

    print(f"Done! Processed: {processed}, Failed: {failed}")

asyncio.run(profile_extraction())
'''

with open('/tmp/profile_extraction.py', 'w') as f:
    f.write(profiling_script)

print("✅ Profiling script created")
import os
import subprocess
import sys

print("🔥 Running py-spy profiler...")
# py-spy is a sampling profiler: it records call stacks at the configured rate
# rather than every individual function call.
print("   Sampling call stacks at 100 Hz...")
print("")

# Run py-spy against the interpreter that runs this kernel, so the profiled
# script sees the same installed packages as the notebook does.
result = subprocess.run([
    'py-spy', 'record',
    '--format', 'speedscope',
    '--output', '/tmp/flamegraph.speedscope.json',
    '--rate', '100',
    '--',
    sys.executable, '/tmp/profile_extraction.py'
], capture_output=True, text=True)

# The output was captured, so surface the script's own progress messages.
if result.stdout.strip():
    print(result.stdout.rstrip())

if result.returncode == 0:
    print("\n✅ Profiling complete!")
    print("\n📊 Flamegraph: /tmp/flamegraph.speedscope.json")
    print("\n🌐 To visualize:")
    print("   1. Download the file (see next cell)")
    print("   2. Go to: https://www.speedscope.app/")
    print("   3. Drag & drop the JSON")

    # Guard the size lookup so a missing output file is reported, not raised.
    if os.path.exists('/tmp/flamegraph.speedscope.json'):
        file_size = os.path.getsize('/tmp/flamegraph.speedscope.json')
        print(f"\n📦 Size: {file_size / 1024:.1f} KB")
    else:
        print(f"\n⚠️  py-spy reported success but wrote no output file:\n{result.stderr}")
else:
    print(f"\n❌ Error:\n{result.stderr}")


## 9. Download Flamegraph


In [None]:
import os
import shutil
from IPython.display import FileLink, display

flamegraph = '/tmp/flamegraph.speedscope.json'

if os.path.exists(flamegraph):
    # FileLink paths are resolved relative to the notebook directory, so a link
    # to an absolute /tmp path cannot be served. Copy the file into the working
    # directory and link to that copy instead.
    local_copy = os.path.basename(flamegraph)
    shutil.copyfile(flamegraph, local_copy)

    print("📥 Download your flamegraph:")
    display(FileLink(local_copy))
    print("\n🌐 Then visit: https://www.speedscope.app/")
    print("   Drag & drop to visualize!")
else:
    print("❌ Run cell 8 first to generate flamegraph")


## 10. Stop All Services


In [None]:
import subprocess

print("🛑 Stopping services...")

# Shutdown stays best-effort, but failures are reported instead of being
# silently discarded (this includes the ngrok cell never having been run).
try:
    ngrok.kill()
    print("✅ Ngrok stopped")
except Exception as exc:
    print(f"⚠️  Ngrok was not stopped: {exc}")

# pkill exits with 0 when it signalled at least one process and 1 when nothing
# matched, so report what actually happened rather than always claiming success.
for service_label, pkill_command in (
    ("Streamlit", ['pkill', '-f', 'streamlit']),
    ("Ollama", ['pkill', 'ollama']),
):
    exit_code = subprocess.run(pkill_command, capture_output=True).returncode
    if exit_code == 0:
        print(f"✅ {service_label} stopped")
    elif exit_code == 1:
        print(f"ℹ️  {service_label} was not running")
    else:
        print(f"⚠️  Could not stop {service_label} (pkill exit code {exit_code})")

print("\n✅ All services stopped")
