# CIM Assistant with Hugging Face Models

This notebook uses Hugging Face models instead of Ollama for local inference.
Perfect for when your Ollama server is unavailable!


In [None]:
# LangChain imports
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.utilities import SQLDatabase
from langchain_community.agent_toolkits import SQLDatabaseToolkit
from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool, InfoSQLDatabaseTool, ListSQLDatabaseTool, QuerySQLCheckerTool
from langgraph.prebuilt import create_react_agent
from dotenv import load_dotenv
import os

# Hugging Face integration
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

print("✅ All imports loaded successfully!")


In [None]:
# Load environment variables
# Calls load_dotenv() (imported from the `dotenv` package in the imports cell)
# with its default arguments, then prints a confirmation message.
load_dotenv()
print("✅ Environment variables loaded!")


## Model Selection

Choose your preferred model based on your needs:
- **Qwen2.5-7B**: Great for coding, reasoning, and SQL tasks
- **Mistral-7B**: Excellent for instruction following
- **CodeLlama-7B**: Specialized for code generation
- **Llama-3.1-8B**: Meta's general-purpose instruct model with strong overall performance


In [None]:
# Model configuration
# MODEL_NAME is the Hugging Face Hub identifier consumed by the loading cell.
# To try a different model, comment out the active line and enable one below.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"  # default
# MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
# MODEL_NAME = "codellama/CodeLlama-7b-Instruct-hf"
# MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# Echo the choice so the selected model is visible in the cell output.
print("Selected model: {}".format(MODEL_NAME))


In [None]:
# Load the tokenizer and causal-LM weights for MODEL_NAME
print(f"🔄 Loading {MODEL_NAME}...")
print("This may take a few minutes on first run...")

# Options forwarded to AutoModelForCausalLM.from_pretrained
model_load_kwargs = dict(
    torch_dtype=torch.float16,  # half precision to save memory
    device_map="auto",          # let the loader decide device placement
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally — confirm it is actually needed for the chosen model.
    trust_remote_code=True,
    low_cpu_mem_usage=True,     # reduce CPU memory usage during loading
)

# Tokenizer first, then the model weights
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)

print("✅ Model loaded successfully!")


In [None]:
# Create the Hugging Face text-generation pipeline and wrap it for LangChain
print("🔄 Creating pipeline...")

# Deterministic output is wanted here (the original asked for temperature 0).
# Sampling requires a strictly positive temperature, so `temperature=0.0`
# together with `do_sample=True` is an invalid combination. Greedy decoding
# (do_sample=False, no temperature) is the supported way to get deterministic text.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=False,          # greedy decoding -> reproducible answers
    return_full_text=False,   # return only the completion, not the prompt
    pad_token_id=tokenizer.eos_token_id,  # handle padding
)

# Initialize LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)

print("✅ Pipeline created and LangChain LLM initialized!")


In [None]:
# Smoke-test the LangChain-wrapped model with a single short prompt
divider = "-" * 50

print("🧪 Testing Hugging Face model...")
print(divider)

test_prompt = "Hello! Can you help me with SQL queries? Please respond briefly."
response = llm.invoke(test_prompt)

# Show the prompt and the model's reply
for label, text in (("Prompt", test_prompt), ("Response", response)):
    print(f"{label}: {text}")

print(divider)
print("✅ Model is working correctly!")


## Database Connection Setup

Next, set up the database connection (the same way as in your original notebook).


In [None]:
# Database connection setup
# Connection details are read from the environment (populated by load_dotenv()
# earlier in the notebook) so real credentials never live in the notebook itself.
# The fallbacks are the original placeholder values.
DB_USER = os.getenv("DB_USER", "your_username")
DB_PASSWORD = os.getenv("DB_PASSWORD", "your_password")
DB_HOST = os.getenv("DB_HOST", "localhost")
DB_PORT = os.getenv("DB_PORT", "5432")
DB_NAME = os.getenv("DB_NAME", "your_database")

# Create database URI
DATABASE_URI = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

# Never echo the password: notebook outputs are saved with the file and shared.
DATABASE_URI_MASKED = f"postgresql://{DB_USER}:***@{DB_HOST}:{DB_PORT}/{DB_NAME}"
print(f"Database URI: {DATABASE_URI_MASKED}")

# Warn only while the placeholder credentials are still in effect.
if DB_USER == "your_username" or DB_PASSWORD == "your_password":
    print("⚠️  Placeholder credentials in use - set DB_USER, DB_PASSWORD, DB_HOST, "
          "DB_PORT and DB_NAME in your environment or .env file!")


In [None]:
# Initialize SQL Database
# Connect and list the usable tables. On failure, print a hint and re-raise:
# swallowing the error would leave `db` undefined and make the following cells
# fail with an unrelated NameError instead of the real connection problem.
try:
    db = SQLDatabase.from_uri(DATABASE_URI)
    # Listing tables doubles as a connection test
    tables = db.get_usable_table_names()
except Exception as e:
    print(f"❌ Database connection failed: {e}")
    print("Please check your database credentials and connection.")
    raise
else:
    # Report success only after both the connection and the table listing worked
    print("✅ Database connection successful!")
    print(f"📊 Available tables: {tables}")


## SQL Agent Setup

Create the SQL agent with your Hugging Face model


In [None]:
# Build the SQL toolkit from the database handle and the LLM, then collect its tools
toolkit = SQLDatabaseToolkit(db=db, llm=llm)
tools = toolkit.get_tools()

# Show the tool names so it is clear what the agent will be able to call
tool_names = []
for tool in tools:
    tool_names.append(tool.name)

print(f"🔧 Available SQL tools: {tool_names}")
print("✅ SQL toolkit created successfully!")


In [None]:
# Create SQL Agent
# create_react_agent takes the language model as `model` (its first parameter);
# it has no `llm` keyword, so the original call raised a TypeError.
# NOTE(review): the ReAct agent relies on tool calling; a plain HuggingFacePipeline
# LLM may need to be wrapped in a chat-model adapter for this to work — confirm.
# NOTE(review): newer LangGraph releases name this argument `prompt` instead of
# `state_modifier` — check against the installed version.
agent = create_react_agent(
    model=llm,
    tools=tools,
    state_modifier=ChatPromptTemplate.from_messages([
        ("system", """You are a helpful SQL assistant. 
        You can help users write SQL queries, analyze database schemas, and answer questions about data.
        Always be precise and helpful in your responses."""),
        # The running conversation is injected after the system instructions
        ("placeholder", "{messages}")
    ])
)

print("✅ SQL Agent created successfully!")
print("🤖 Your Hugging Face-powered SQL assistant is ready!")


## Test Your Agent

Try some SQL queries with your Hugging Face model!


In [None]:
# Run one simple question through the agent and print its final answer
test_query = "What tables are available in the database?"
rule = "-" * 60

print(f"🔍 Testing query: {test_query}")
print(rule)

try:
    # The agent expects its input as a list of messages under the "messages" key
    agent_input = {"messages": [HumanMessage(content=test_query)]}
    result = agent.invoke(agent_input)
    # The last message in the returned conversation holds the agent's reply
    final_message = result['messages'][-1]
    print(f"Response: {final_message.content}")
except Exception as e:
    # Best-effort demo cell: report the problem instead of stopping the notebook
    print(f"❌ Error: {e}")

print(rule)


## Memory Optimization Options

If you're running into memory issues, try these configurations:


In [None]:
# Memory optimization configurations
#
# Each option below is a commented-out alternative to the model-loading call.
# Uncomment exactly one, run it in place of the default load, then re-run the
# pipeline cell so it picks up the new `model`.
# Quantization is requested through `quantization_config=BitsAndBytesConfig(...)`;
# passing `load_in_4bit` / `load_in_8bit` directly to `from_pretrained` is deprecated.

# Option 1: 4-bit quantization (requires bitsandbytes)
# from transformers import BitsAndBytesConfig
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     torch_dtype=torch.float16,
#     device_map="auto",
#     quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # 4-bit quantization
#     trust_remote_code=True
# )

# Option 2: 8-bit quantization (requires bitsandbytes)
# from transformers import BitsAndBytesConfig
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     torch_dtype=torch.float16,
#     device_map="auto",
#     quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit quantization
#     trust_remote_code=True
# )

# Option 3: CPU-only inference (slower but uses less GPU memory)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     torch_dtype=torch.float32,
#     device_map="cpu",  # Force CPU usage
#     trust_remote_code=True
# )

print("💡 Memory optimization options available above.")
print("Uncomment the configuration you want to use.")


## Model Performance Comparison

Here are recommended models for different kinds of tasks, listed in order of preference:


In [None]:
# Model recommendations for different tasks
# Maps a task category to Hugging Face model ids, in the order they are listed.
model_recommendations = {
    "SQL & Database Tasks": [
        "Qwen/Qwen2.5-7B-Instruct",
        "mistralai/Mistral-7B-Instruct-v0.3",
        "meta-llama/Llama-3.1-8B-Instruct"
    ],
    "Code Generation": [
        "codellama/CodeLlama-7b-Instruct-hf",
        "Qwen/Qwen2.5-7B-Instruct",
        "mistralai/Mistral-7B-Instruct-v0.3"
    ],
    "General Reasoning": [
        "meta-llama/Llama-3.1-8B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "mistralai/Mistral-7B-Instruct-v0.3"
    ],
    "Low Memory (4-bit)": [
        "Qwen/Qwen2.5-7B-Instruct",
        "mistralai/Mistral-7B-Instruct-v0.3"
    ]
}

for task, models in model_recommendations.items():
    print(f"\n📋 {task}:")
    # The loop variable is deliberately NOT called `model`: in a notebook that
    # name is the loaded Hugging Face model, and rebinding it to a string here
    # would break any later re-run of the pipeline cell.
    for i, model_id in enumerate(models, 1):
        print(f"  {i}. {model_id}")

print("\n💡 Choose the model that best fits your needs and hardware!")
print("🔄 To switch models, just change the MODEL_NAME variable and re-run the loading cells.")
