<a href="https://colab.research.google.com/github/waqasm86/Kaggle-Dropbox-HuggingFace/blob/main/p1_colab_gradio_langfuse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Uninstall any existing copies of the three Hugging Face libraries, then
# install them again with --upgrade. Only pip's stdout is sent to /dev/null,
# so messages written to stderr still appear in the cell output.
!pip uninstall -q -y transformers datasets accelerate > /dev/null
!pip install -q transformers datasets accelerate --upgrade > /dev/null
# Import the freshly installed packages so later cells can report their versions.
import transformers, datasets, accelerate


In [None]:
# Report the installed version of each Hugging Face library, one per line.
print(
    f"transformers version: {transformers.__version__}",
    f"accelerate version: {accelerate.__version__}",
    f"datasets version: {datasets.__version__}",
    sep="\n",
)


transformers version: 4.55.4
accelerate version: 1.10.1
datasets version: 4.0.0


In [None]:
# First, install required packages
!pip install -qU gradio langfuse plotly sentencepiece protobuf --upgrade > /dev/null

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.19.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3, but you have protobuf 6.32.0 which is incompatible.
google-ai-generativelanguage 0.6.15 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 6.32.0 which is incompatible.
grpcio-status 1.71.2 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 6.32.0 which is incompatible.

In [None]:


import json
import os
import random
import warnings
from datetime import datetime, timedelta

import gradio as gr
import langfuse
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import torch
from plotly.subplots import make_subplots
from transformers import AutoTokenizer, AutoModelForCausalLM
warnings.filterwarnings('ignore')

# Initialize Langfuse. Credentials are read from the environment so real keys
# never have to be pasted into the notebook; the placeholder values below are
# used only when the corresponding variable is not set.
langfuse_client = langfuse.Langfuse(
    public_key=os.environ.get("LANGFUSE_PUBLIC_KEY", "pk-lf-12345678-1234-1234-1234-123456789012"),
    secret_key=os.environ.get("LANGFUSE_SECRET_KEY", "sk-lf-12345678-1234-1234-1234-123456789012"),
    host=os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com"),
)

# Load a lightweight model that can run on T4 GPU
MODEL_NAME = "microsoft/DialoGPT-small"  # Very lightweight model

class LightweightLLM:
    """Wrapper around the ``MODEL_NAME`` causal LM that logs each generation to Langfuse.

    The tokenizer and model are loaded on construction and the model is moved
    to CUDA when it is available, otherwise it stays on the CPU.
    """

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Reuse the end-of-sequence token as the padding token.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.trace = None  # not read anywhere in this class

    def generate_response(self, prompt: str, max_length: int = 100):
        """Generate a reply to ``prompt`` and record the call in Langfuse.

        Args:
            prompt: User text. The tokenizer's EOS token is appended before
                encoding.
            max_length: Forwarded to ``model.generate`` as ``max_length``; in
                transformers this caps prompt tokens plus generated tokens.

        Returns:
            The decoded reply (generated tokens only, special tokens removed),
            or a string starting with ``"Error generating response: "`` if
            tokenization, generation or logging raised.
        """
        # Single source of truth so the logged parameters match what is used.
        temperature = 0.7

        # NOTE(review): trace()/generation()/score() are called as-is; confirm
        # the installed Langfuse SDK version still provides these methods.
        trace = langfuse_client.trace(
            name="llm-generation",
            user_id="demo-user",
            metadata={"model": MODEL_NAME, "device": str(self.device)}
        )

        generation = trace.generation(
            name="chat-completion",
            model=MODEL_NAME,
            model_parameters={"max_length": max_length, "temperature": temperature}
        )

        try:
            inputs = self.tokenizer.encode(prompt + self.tokenizer.eos_token, return_tensors="pt")
            inputs = inputs.to(self.device)
            prompt_token_count = inputs.shape[-1]

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    # One unpadded sequence, so every position is a real token.
                    attention_mask=torch.ones_like(inputs),
                    max_length=max_length,
                    pad_token_id=self.tokenizer.eos_token_id,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.9
                )

            # The returned sequence starts with the prompt tokens. Decode only
            # what follows them instead of string-replacing the prompt text,
            # which would also delete a reply that repeats the prompt.
            new_tokens = outputs[0][prompt_token_count:]
            response = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

            # Update generation with output; output_tokens counts only the
            # generated tokens, not the prompt.
            generation.end(
                output=response,
                usage={"input_tokens": prompt_token_count, "output_tokens": len(new_tokens)}
            )

            # Log quality score
            trace.score(
                name="response_quality",
                value=random.uniform(0.7, 0.95),  # Simulated quality score
                comment="Automated quality assessment"
            )

            return response

        except Exception as e:
            # Record the failure on the trace and hand the chat UI a readable
            # message instead of raising.
            generation.end(
                output=f"Error: {str(e)}",
                status_message=str(e)
            )

            trace.score(
                name="error",
                value=0.0,
                comment=f"Error: {str(e)}"
            )
            return f"Error generating response: {str(e)}"

# Initialize the model. Constructing LightweightLLM loads the tokenizer and
# model named by MODEL_NAME and moves the model to the selected device.
llm = LightweightLLM()

# Mock data for demonstration
def generate_mock_metrics(hours: int = 24) -> dict:
    """Generate mock observability metrics, one random sample per hour.

    Args:
        hours: Number of hourly samples to produce. Defaults to 24. A value
            of zero or less yields empty series.

    Returns:
        A dict with the keys ``latency``, ``throughput``, ``error_rate``,
        ``quality_score``, ``dates`` and ``token_usage``. Every value is a
        list of ``hours`` items; ``dates`` starts at the current time and
        steps back one hour per item.
    """
    # Read the clock once so consecutive timestamps are exactly one hour
    # apart rather than drifting by the time each call takes.
    now = datetime.now()
    dates = [now - timedelta(hours=offset) for offset in range(hours)]

    return {
        'latency': [random.uniform(0.1, 2.0) for _ in range(hours)],
        'throughput': [random.randint(50, 200) for _ in range(hours)],
        'error_rate': [random.uniform(0.01, 0.1) for _ in range(hours)],
        'quality_score': [random.uniform(0.6, 0.95) for _ in range(hours)],
        'dates': dates,
        'token_usage': [random.randint(100, 500) for _ in range(hours)]
    }

def create_latency_chart():
    """Return a blue line chart of freshly generated mock latency samples."""
    samples = generate_mock_metrics()
    axis_labels = {'x': 'Time', 'y': 'Latency (s)'}

    chart = px.line(
        x=samples['dates'],
        y=samples['latency'],
        title='Response Latency (seconds)',
        labels=axis_labels,
    )
    chart.update_traces(line={'color': 'blue'})
    return chart

def create_throughput_chart():
    """Return a green bar chart of freshly generated mock throughput samples."""
    samples = generate_mock_metrics()
    timestamps, requests = samples['dates'], samples['throughput']

    chart = px.bar(
        x=timestamps,
        y=requests,
        title='Requests Throughput',
        labels={'x': 'Time', 'y': 'Requests per hour'},
    )
    chart.update_traces(marker_color='green')
    return chart

def create_quality_chart():
    """Return an orange scatter plot of freshly generated mock quality scores."""
    samples = generate_mock_metrics()
    marker_style = {'color': 'orange', 'size': 10}

    chart = px.scatter(
        x=samples['dates'],
        y=samples['quality_score'],
        title='Response Quality Scores',
        labels={'x': 'Time', 'y': 'Quality Score (0-1)'},
    )
    chart.update_traces(marker=marker_style)
    return chart

def create_comprehensive_dashboard():
    """Return a 3x2 subplot figure combining all mock metrics.

    Five panels plot latency, throughput, error rate, quality score and token
    usage against time; the sixth holds text labels with the averages of the
    first four metrics.
    """
    data = generate_mock_metrics()
    timeline = data['dates']

    panel_titles = ('Latency', 'Throughput', 'Error Rate', 'Quality Score', 'Token Usage', 'Summary')
    grid_specs = [[{"secondary_y": False} for _ in range(2)] for _ in range(3)]
    dashboard = make_subplots(rows=3, cols=2, subplot_titles=panel_titles, specs=grid_specs)

    # (trace class, metric key, legend name, styling kwargs, row, col)
    panels = (
        (go.Scatter, 'latency', "Latency", {'line': {'color': 'blue'}}, 1, 1),
        (go.Bar, 'throughput', "Throughput", {'marker_color': 'green'}, 1, 2),
        (go.Scatter, 'error_rate', "Error Rate", {'line': {'color': 'red'}}, 2, 1),
        (go.Scatter, 'quality_score', "Quality", {'line': {'color': 'orange'}}, 2, 2),
        (go.Bar, 'token_usage', "Tokens", {'marker_color': 'purple'}, 3, 1),
    )
    for trace_cls, key, legend_name, styling, row, col in panels:
        dashboard.add_trace(
            trace_cls(x=timeline, y=data[key], name=legend_name, **styling),
            row=row, col=col
        )

    # Averages shown as text-only points in the last panel.
    mean_latency = np.mean(data['latency'])
    mean_throughput = np.mean(data['throughput'])
    mean_error_rate = np.mean(data['error_rate'])
    mean_quality = np.mean(data['quality_score'])
    summary_lines = [
        f"Avg Latency: {mean_latency:.2f}s",
        f"Avg Throughput: {mean_throughput:.0f}/hr",
        f"Avg Error Rate: {mean_error_rate:.2%}",
        f"Avg Quality: {mean_quality:.2f}"
    ]

    summary_trace = go.Scatter(
        x=[0, 1, 2, 3],
        y=[0, 0, 0, 0],
        text=summary_lines,
        mode="text",
        textposition="middle center",
        showlegend=False
    )
    dashboard.add_trace(summary_trace, row=3, col=2)

    dashboard.update_layout(height=800, showlegend=True, title_text="LLM Observability Dashboard")
    return dashboard

def chat_with_llm(message, history):
    """Return the model's reply to ``message``.

    ``history`` is part of the signature but is not used; each message is
    answered on its own.
    """
    return llm.generate_response(message)

def get_recent_traces():
    """Return ten mock trace rows as a DataFrame (demo stand-in for Langfuse data).

    Each row has ``id``, ``name``, ``timestamp``, ``user_id`` and ``metadata``
    columns. If building the rows raises, a one-row DataFrame with a single
    ``Error`` column describing the failure is returned instead.
    """
    def _mock_row(index, metadata):
        # Random age (1-60 minutes) first, then a random user (1-5).
        minutes_ago = random.randint(1, 60)
        stamp = datetime.now() - timedelta(minutes=minutes_ago)
        user_number = random.randint(1, 5)
        return {
            'id': f"trace-{index}",
            'name': f"Generation {index}",
            'timestamp': stamp.isoformat(),
            'user_id': f"user-{user_number}",
            'metadata': metadata
        }

    try:
        # Demo only: the rows are fabricated here rather than fetched from
        # Langfuse. The metadata payload is the same for every row.
        shared_metadata = json.dumps({"model": MODEL_NAME, "status": "completed"})
        rows = [_mock_row(index, shared_metadata) for index in range(10)]
        return pd.DataFrame(rows)
    except Exception as e:
        return pd.DataFrame({'Error': [f"Failed to fetch traces: {str(e)}"]})

# Create the Gradio interface
# Four-tab UI: metric plots, a chat box wired to the model, a table of
# (mock) traces, and static model information.
with gr.Blocks(title="LLM Observability Dashboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 LLM Observability Dashboard")
    gr.Markdown("Monitor your LLM performance with real-time metrics and analytics")

    with gr.Tab("Live Metrics"):
        # Each plot is built once when the UI is constructed; the refresh
        # button below regenerates them.
        with gr.Row():
            with gr.Column():
                latency_plot = gr.Plot(label="Latency", value=create_latency_chart())
            with gr.Column():
                throughput_plot = gr.Plot(label="Throughput", value=create_throughput_chart())

        with gr.Row():
            with gr.Column():
                quality_plot = gr.Plot(label="Quality Scores", value=create_quality_chart())
            with gr.Column():
                comprehensive_dash = gr.Plot(label="Comprehensive Dashboard", value=create_comprehensive_dashboard())

        refresh_btn = gr.Button("🔄 Refresh Metrics")

    with gr.Tab("Chat Interface"):
        gr.Markdown("### Test the LLM with real-time monitoring")
        chatbot = gr.ChatInterface(
            fn=chat_with_llm,
            title="LLM Chat with Observability",
            description="Chat with the model while monitoring performance metrics"
        )

    with gr.Tab("Langfuse Traces"):
        gr.Markdown("### Recent Traces from Langfuse")
        # Passing the function (not its result) lets Gradio call it again on
        # the `every` interval.
        traces_table = gr.Dataframe(
            label="Recent Traces",
            value=get_recent_traces,
            every=30  # Refresh every 30 seconds
        )
        refresh_traces = gr.Button("🔄 Refresh Traces")

    with gr.Tab("Model Info"):
        gr.Markdown(f"### Model: {MODEL_NAME}")
        gr.Markdown(f"**Device:** {llm.device}")
        gr.Markdown(f"**Parameters:** ~{llm.model.num_parameters():,}")
        gr.Markdown("**Capabilities:** Text generation, conversation")

        # Model statistics (hard-coded sample values, not measured)
        stats_data = {
            'Metric': ['Avg Latency', 'Peak Throughput', 'Error Rate', 'Avg Quality'],
            'Value': ['0.8s', '200 req/hr', '2.5%', '0.85'],
            'Status': ['✅ Good', '✅ Good', '⚠️ Warning', '✅ Good']
        }
        stats_df = pd.DataFrame(stats_data)
        gr.Dataframe(stats_df, label="Current Performance")

    # Refresh functionality
    def refresh_all():
        """Rebuild every plot and the traces table, in the order of the click outputs."""
        return [
            create_latency_chart(),
            create_throughput_chart(),
            create_quality_chart(),
            create_comprehensive_dashboard(),
            get_recent_traces()
        ]

    refresh_btn.click(
        fn=refresh_all,
        outputs=[latency_plot, throughput_plot, quality_plot, comprehensive_dash, traces_table]
    )

    refresh_traces.click(
        fn=get_recent_traces,
        outputs=traces_table
    )

# Launch the dashboard
if __name__ == "__main__":
    # share=True requests a public link in addition to the server bound to
    # localhost:8083; show_error=True enables Gradio's error display.
    launch_options = {
        "share": True,
        "server_name": "localhost",
        "server_port": 8083,
        "show_error": True,
    }
    demo.launch(**launch_options)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b85c651f2febe2e95b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
