In [26]:
# ========================================
# PART 1: Setup and Installation (Run this first)
# ========================================

# Install required packages (Colab shell commands). Versions are not pinned,
# so every run installs whatever release is current at that time.
# App framework and LLM orchestration:
!pip install -q streamlit langchain langchain-community langchain-ollama
# PDF / Word document loading and the sentence-transformers library:
!pip install -q pypdf docx2txt unstructured sentence-transformers
# FAISS vector index, HuggingFace embeddings integration, and its runtime:
!pip install -q faiss-cpu langchain-huggingface transformers torch
# Tunnel client imported by PART 3:
!pip install -q pyngrok

# Mount Google Drive at /content/drive. The OLLAMA_MODELS directory set
# further down in this cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Install and setup Ollama in Colab by piping the install script from
# ollama.com straight into sh.
!curl -fsSL https://ollama.com/install.sh | sh

# Start Ollama service in background
import subprocess
import time
import os

# Set Ollama models directory to your Google Drive
os.environ['OLLAMA_MODELS'] = '/content/drive/MyDrive/Ollama_Models'

# Kill any existing Ollama processes
!pkill -f ollama

time.sleep(2)

# Start Ollama server in background with proper output handling
print("Starting Ollama server...")
with open('/tmp/ollama.log', 'w') as f:
    ollama_process = subprocess.Popen(
        ['ollama', 'serve'],
        stdout=f,
        stderr=subprocess.STDOUT,
        env=os.environ.copy()
    )

# Wait and verify server is running
time.sleep(10)

# Check if Ollama is responding
max_retries = 5
for i in range(max_retries):
    try:
        result = subprocess.run(['ollama', 'list'], capture_output=True, timeout=5)
        if result.returncode == 0:
            print("✓ Ollama server is running!")
            break
    except:
        pass

    if i < max_retries - 1:
        print(f"Waiting for Ollama to start... ({i+1}/{max_retries})")
        time.sleep(5)
    else:
        print("⚠️ Ollama may not have started properly. Check /tmp/ollama.log")
        print("\nOllama log (last 20 lines):")
        !tail -20 /tmp/ollama.log

# Verify the model files exist
model_path = '/content/drive/MyDrive/Ollama_Models'
if os.path.exists(model_path):
    print(f"✓ Found Ollama_Models folder")
    print(f"Contents: {os.listdir(model_path)}")

    # Check for blobs and manifests
    if os.path.exists(os.path.join(model_path, 'blobs')):
        print(f"✓ Found blobs folder")
    if os.path.exists(os.path.join(model_path, 'manifests')):
        print(f"✓ Found manifests folder")
        manifests_path = os.path.join(model_path, 'manifests')
        for root, dirs, files in os.walk(manifests_path):
            print(f"  Manifest structure: {root}")
            print(f"  Dirs: {dirs}")
            print(f"  Files: {files}")
else:
    print("✗ Ollama_Models folder not found!")

# List available models
print("\nChecking available models...")
!ollama list

print("\n⚠️ IMPORTANT: If gemma3:1b is not listed above, we'll need to pull it.")
print("The model files in your Drive need to be properly recognized by Ollama.")
print("\nOptions:")
print("1. Pull the model: !ollama pull gemma3:1b")
print("2. Or use a different model that's available")

print("\n✓ Setup complete! Now run PART 2 to create the app file.")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
Starting Ollama server...
✓ Ollama server is running!
✓ Found Ollama_Models folder
Contents: ['blobs', 'manifests']
✓ Found blobs folder
✓ Found manifests folder
  Manifest structure: /content/drive/MyDrive/Ollama_Models/manifests
  Dirs: ['registry.ollama.ai']
  Files: []
  Manifest structure: /content/drive/MyDrive/Ollama_Models/manifests/registry.ollama.ai
  Dirs: ['library']
  Files: []
  Manifest structure: /content/drive

In [27]:
# ========================================
# PART 1.5: Pull/Verify Gemma Model (Run after PART 1)
# ========================================

# Available small models for Colab (sorted by size):
# - tinyllama (637 MB) - Fastest, less accurate
# - llama3.2:1b (1.3 GB) - Good balance
# - phi3:mini (2.3 GB) - Better quality
# - gemma3:1b (1.6 GB) - Good balance, what you wanted
# - gemma2:2b (1.6 GB) - Better than gemma3:1b

MODEL_TO_USE = "gemma3:1b"  # Change this if you want a different model

print(f"Checking for {MODEL_TO_USE} model...")

# List current models
import subprocess
result = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
models_output = result.stdout

if MODEL_TO_USE in models_output:
    print(f"✓ {MODEL_TO_USE} model found!")
else:
    print(f"⚠️ {MODEL_TO_USE} not found. Pulling from Ollama library...")
    print("This will take a few minutes...")
    !ollama pull {MODEL_TO_USE}
    print("✓ Model downloaded!")

# Verify model is now available
!ollama list

print(f"\n✓ Model verification complete! Using: {MODEL_TO_USE}")
print("Now run PART 2 to create the app file.")

Checking for gemma3:1b model...
✓ gemma3:1b model found!
NAME         ID              SIZE      MODIFIED       
gemma3:1b    8648f39daa8f    815 MB    35 minutes ago    

✓ Model verification complete! Using: gemma3:1b
Now run PART 2 to create the app file.


In [28]:
# ========================================
# PART 2: Create the Streamlit App (Run after PART 1.5)
# ========================================

%%writefile app.py
import os
import warnings
import sys

# Suppress all warnings before imports
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Suppress protobuf warnings
import logging
logging.getLogger('google.protobuf').setLevel(logging.ERROR)

import streamlit as st
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaLLM
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
import shutil

# Configuration
MODEL_NAME = "gemma3:1b"  # Model to use (should match PART 1.5)

# Prefer HuggingFace embeddings; fall back to Ollama embeddings when the
# HuggingFace integration cannot be imported. EMBEDDING_TYPE records which
# of the two classes is actually bound in this module.
try:
    from langchain_huggingface import HuggingFaceEmbeddings
    EMBEDDING_TYPE = "huggingface"
except Exception:
    # Any import-time failure selects the fallback. `Exception` rather than a
    # bare `except:` so KeyboardInterrupt / SystemExit are not swallowed.
    from langchain_ollama import OllamaEmbeddings
    EMBEDDING_TYPE = "ollama"

# Check for GPU availability; DEVICE is the device string handed to the
# embedding model ('cuda' when torch reports CUDA, otherwise 'cpu').
try:
    import torch
    GPU_AVAILABLE = torch.cuda.is_available()
    DEVICE = 'cuda' if GPU_AVAILABLE else 'cpu'
except ImportError:
    GPU_AVAILABLE = False
    DEVICE = 'cpu'

# Function to check if Ollama is running
def check_ollama_health():
    """Check if Ollama server is responding.

    Runs ``curl -s`` against ``http://localhost:11434/api/tags`` with a
    2 second timeout.

    Returns:
        bool: True when curl exits with status 0; False when it exits with a
        non-zero status, times out, or cannot be executed.
    """
    import subprocess

    try:
        result = subprocess.run(
            ['curl', '-s', 'http://localhost:11434/api/tags'],
            capture_output=True,
            timeout=2
        )
    except Exception:
        # Best-effort probe: any ordinary failure means "not healthy".
        # `Exception` (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate instead of being reported as "server down".
        return False
    return result.returncode == 0

# Startup gate: when the Ollama server does not answer, show restart
# instructions and halt this script run before any other UI is drawn.
ollama_up_at_startup = check_ollama_health()
if not ollama_up_at_startup:
    startup_restart_snippet = """
# Run this in a new Colab cell:
import subprocess
import os

# Kill existing Ollama
!pkill -f ollama

# Start Ollama again
os.environ['OLLAMA_MODELS'] = '/content/drive/MyDrive/Ollama_Models'
with open('/tmp/ollama.log', 'w') as f:
    subprocess.Popen(['ollama', 'serve'], stdout=f, stderr=subprocess.STDOUT)
    """
    st.error("⚠️ Ollama server is not responding!")
    st.info("Please restart the Ollama server in your Colab notebook:")
    st.code(startup_restart_snippet)
    st.stop()

# Google Drive locations used by the app
DRIVE_ROOT = '/content/drive/MyDrive'
os.environ['OLLAMA_MODELS'] = os.path.join(DRIVE_ROOT, 'Ollama_Models')

# The FAISS index is persisted in a folder next to the running script's cwd.
vector_space_dir = os.path.join(os.getcwd(), "vector_db")
os.makedirs(vector_space_dir, exist_ok=True)

st.set_page_config(page_title="RAG ChatBot", layout="centered")
st.title(f"RAG ChatBot (Langchain + {MODEL_NAME})")

# Seed per-session state exactly once; existing values survive Streamlit reruns.
for state_key in ('vectorstore', 'retriever', 'embedding_model'):
    if state_key not in st.session_state:
        st.session_state[state_key] = None
if 'memory' not in st.session_state:
    # Conversation history handed to the retrieval chain on every question.
    st.session_state['memory'] = ConversationBufferMemory(
        output_key='answer',
        return_messages=True,
        memory_key="chat_history",
    )

upload_file = st.file_uploader(
    "Upload PDF or Word file",
    type=["pdf", "docx", "doc"],
    key='upload_file',
)

def get_embedding_model():
    """Return the session's embedding model, building and caching it on first use.

    Selection order when nothing is cached in ``st.session_state``:
      1. HuggingFace embeddings loaded from ``<DRIVE_ROOT>/local_model`` when
         that path exists and the HuggingFace integration was imported.
      2. HuggingFace ``sentence-transformers/all-MiniLM-L6-v2``.
      3. Ollama ``nomic-embed-text`` when the HuggingFace integration is
         unavailable.
    If building the selected model raises, Ollama embeddings are tried once
    more as a fallback.

    Returns:
        The embeddings object stored in ``st.session_state['embedding_model']``,
        or None when the fallback could not be initialised either.
    """
    if st.session_state['embedding_model'] is None:
        with st.spinner("Loading embedding model..."):
            try:
                # Path to local model in Google Drive
                local_model_path = os.path.join(DRIVE_ROOT, "local_model")
                # HuggingFaceEmbeddings is only bound when its import succeeded;
                # both HuggingFace branches must check this, otherwise the
                # local-model branch fails with a NameError and shows a
                # misleading error before falling back.
                huggingface_ready = EMBEDDING_TYPE == "huggingface"

                if huggingface_ready and os.path.exists(local_model_path):
                    st.info(f"Found local model in Drive, loading on {DEVICE.upper()}...")
                    os.environ["TOKENIZERS_PARALLELISM"] = "false"

                    st.session_state['embedding_model'] = HuggingFaceEmbeddings(
                        model_name=local_model_path,
                        model_kwargs={
                            'device': DEVICE,
                            'trust_remote_code': True
                        },
                        encode_kwargs={'normalize_embeddings': True}
                    )
                    st.success(f"✓ Using local HuggingFace embeddings on {DEVICE.upper()}")

                elif huggingface_ready:
                    os.environ["TOKENIZERS_PARALLELISM"] = "false"

                    st.session_state['embedding_model'] = HuggingFaceEmbeddings(
                        model_name="sentence-transformers/all-MiniLM-L6-v2",
                        model_kwargs={
                            'device': DEVICE,
                            'trust_remote_code': True
                        },
                        encode_kwargs={'normalize_embeddings': True}
                    )
                    st.info(f"✓ Using HuggingFace embeddings on {DEVICE.upper()}")
                else:
                    st.session_state['embedding_model'] = OllamaEmbeddings(
                        model="nomic-embed-text",
                        base_url="http://localhost:11434"
                    )
                    st.info("✓ Using Ollama embeddings")

            except Exception as e:
                st.error(f"HuggingFace embedding error: {str(e)}")
                st.warning("Trying Ollama embeddings as fallback...")
                try:
                    # Imported here because the module-level import of
                    # OllamaEmbeddings only happens when HuggingFace is missing.
                    from langchain_ollama import OllamaEmbeddings
                    st.session_state['embedding_model'] = OllamaEmbeddings(
                        model="nomic-embed-text",
                        base_url="http://localhost:11434"
                    )
                    st.info("✓ Using Ollama embeddings (fallback)")
                except Exception as e2:
                    st.error(f"Failed to initialize embeddings: {str(e2)}")
                    return None

    return st.session_state['embedding_model']

# Build the vector DB once per session: only when a file is uploaded and no
# vector store is cached yet.
if upload_file is not None and st.session_state['vectorstore'] is None:
    with st.spinner("Loading document and creating vector DB...."):
        try:
            # The upload name is supplied by the client. Keep only its final
            # path component so the file is always written inside the working
            # directory, whatever the name contains.
            safe_name = os.path.basename(upload_file.name)
            file_path = os.path.join(os.getcwd(), safe_name)
            with open(file_path, "wb") as f:
                f.write(upload_file.getbuffer())
            # Remembered so "Clear Session" can delete the saved copy.
            st.session_state['file_path'] = file_path

            file_extension = os.path.splitext(safe_name)[1].lower()
            st.info(f"Processing {file_extension} file...")

            # Pick the loader class by extension. An unknown extension is
            # reported explicitly instead of failing later with a NameError
            # on an unbound `loader`.
            loader_by_extension = {
                '.pdf': PyPDFLoader,
                '.docx': Docx2txtLoader,
                '.doc': UnstructuredWordDocumentLoader,
            }
            loader_cls = loader_by_extension.get(file_extension)
            if loader_cls is None:
                raise ValueError(f"Unsupported file type: {file_extension}")
            loader = loader_cls(file_path)

            documents = loader.load()
            st.success(f"✓ Loaded {len(documents)} document pages/sections")

            embedding_model = get_embedding_model()

            if embedding_model is None:
                st.error("Failed to initialize embedding model.")
            else:
                vectorstore = FAISS.from_documents(documents, embedding_model)
                vectorstore.save_local(vector_space_dir)
                st.session_state['vectorstore'] = vectorstore
                # The retriever returns the 2 most similar entries per query.
                st.session_state['retriever'] = vectorstore.as_retriever(search_kwargs={"k": 2})
                st.success("✓ Vector DB Created successfully!")

        except Exception as e:
            st.error(f"Error processing document: {str(e)}")
            import traceback
            st.code(traceback.format_exc())

# Initialize LLM model
# The model is taken from the MODEL_NAME constant defined in the configuration
# section near the top of this file (options: "gemma3:1b", "llama3.2:1b",
# "phi3:mini", "tinyllama"). It is deliberately not reassigned here: the page
# title is rendered from that same constant, so a second definition would let
# the displayed model and the model actually used drift apart.
llm = OllamaLLM(
    model=MODEL_NAME,
    base_url="http://localhost:11434",
    temperature=0.7,
    timeout=60,
    # Value 1 when a CUDA GPU was detected, otherwise 0.
    num_gpu=1 if GPU_AVAILABLE else 0
)

# The question box is only shown once a document has been indexed.
if st.session_state['retriever'] is not None:
    user_question = st.text_input("Ask your question:", key='text')

    if user_question:
        # Probe the server before spending time on retrieval and generation.
        server_alive = check_ollama_health()
        if not server_alive:
            st.error("⚠️ Ollama server connection lost!")
            st.info("Please restart Ollama in your Colab notebook and refresh this page.")
            st.stop()

        with st.spinner("Thinking...."):
            try:
                # The chain is rebuilt for every question; the conversation
                # history lives in the session-level memory object.
                chain_options = dict(
                    llm=llm,
                    retriever=st.session_state['retriever'],
                    memory=st.session_state['memory'],
                    return_source_documents=False,
                    verbose=True,
                )
                qa_chain = ConversationalRetrievalChain.from_llm(**chain_options)

                result = qa_chain.invoke({"question": user_question})

                # Dict results carry the text under 'answer' (or 'result');
                # anything else is shown via its string form.
                answer = (
                    result.get('answer', result.get('result', 'No answer generated'))
                    if isinstance(result, dict)
                    else str(result)
                )

                st.markdown(f"**You:** {user_question}")
                st.markdown(f"**Bot:** {answer}")

            except Exception as exc:
                error_msg = str(exc)
                connection_markers = ("Cannot assign requested address", "ConnectError")
                lost_connection = any(marker in error_msg for marker in connection_markers)
                if lost_connection:
                    st.error("⚠️ Lost connection to Ollama server!")
                    st.info("The Ollama server stopped responding. Please restart it:")
                    st.code("""
# Run in Colab:
!pkill -f ollama
import subprocess, os, time
os.environ['OLLAMA_MODELS'] = '/content/drive/MyDrive/Ollama_Models'
with open('/tmp/ollama.log', 'w') as f:
    subprocess.Popen(['ollama', 'serve'], stdout=f, stderr=subprocess.STDOUT)
time.sleep(10)
!ollama list
                    """)
                else:
                    st.error(f"Error generating response: {error_msg}")
                    import traceback
                    with st.expander("Show detailed error"):
                        st.code(traceback.format_exc())

def del_vectordb(path):
    """Remove the directory tree at `path` if it exists.

    Any failure is shown as a Streamlit warning instead of being raised.
    """
    try:
        if not os.path.exists(path):
            return
        shutil.rmtree(path)
    except Exception as err:
        st.warning(f"Could not delete vector DB: {err!s}")

def del_uploaded_file(path):
    """Delete the saved upload at `path`, if there is one.

    Args:
        path: Filesystem path of the saved upload, or None / "" when no file
            has been uploaded in this session.

    Failures while deleting are shown as a Streamlit warning, not raised.
    """
    if not path:
        # Nothing to delete. This must be checked before os.path.exists():
        # os.path.exists(None) raises TypeError, which would otherwise end up
        # as a spurious "Could not delete file" warning.
        return
    try:
        if os.path.exists(path):
            os.remove(path)
    except Exception as e:
        st.warning(f"Could not delete file: {str(e)}")

# "Clear Session": wipe chat memory, cached models/index, files on disk and
# the widget values, then rerun the script from the top.
clear_requested = st.button("Clear Session")
if clear_requested:
    st.session_state['memory'].clear()
    for cached_key in ('retriever', 'vectorstore', 'embedding_model'):
        st.session_state[cached_key] = None

    del_vectordb(vector_space_dir)
    saved_upload = st.session_state.get('file_path', None)
    del_uploaded_file(saved_upload)
    st.session_state['file_path'] = None

    # Drop the stored values of the uploader and the question box.
    for widget_key in ('upload_file', 'text'):
        if widget_key in st.session_state:
            del st.session_state[widget_key]

    st.success('Session, document and VectorDB are cleared')
    st.rerun()

# Sidebar: server health, active configuration, and document status.
with st.sidebar:
    st.header("Configuration")

    # Ollama health check
    ollama_healthy = check_ollama_health()
    if ollama_healthy:
        st.success("✓ Ollama server: Running")
    else:
        st.error("✗ Ollama server: Not responding")
        # The button's state is read back through its session-state key.
        st.button("Show restart instructions", key="restart_btn")
        if st.session_state.get("restart_btn"):
            st.code("""
# Run in Colab:
!pkill -f ollama
import subprocess, os, time
os.environ['OLLAMA_MODELS'] = '/content/drive/MyDrive/Ollama_Models'
with open('/tmp/ollama.log', 'w') as f:
    subprocess.Popen(['ollama', 'serve'], stdout=f, stderr=subprocess.STDOUT)
time.sleep(10)
            """)

    st.markdown(f"""
    **Ollama Models Path:**
    ```
    {os.environ.get('OLLAMA_MODELS', 'Not set')}
    ```

    **Current LLM Model:** {MODEL_NAME}

    **Device:** {DEVICE.upper()}

    **GPU Available:** {'✓ Yes' if GPU_AVAILABLE else '✗ No (using CPU)'}
    """)

    if GPU_AVAILABLE:
        st.success(f"🚀 GPU Acceleration Enabled!")
        try:
            import torch
            gpu_name = torch.cuda.get_device_name(0)
            st.info(f"GPU: {gpu_name}")
        except Exception:
            # The GPU name is purely informational, so ordinary errors are
            # ignored. `Exception` instead of a bare `except:` so that
            # BaseException subclasses (KeyboardInterrupt, SystemExit, ...)
            # raised inside this block are not silently discarded.
            pass
    else:
        st.warning("⚠️ Running on CPU (slower)")

    st.header("Status")

    # Same Drive path that get_embedding_model() checks for a local model.
    local_model_path = os.path.join(DRIVE_ROOT, "local_model")
    if os.path.exists(local_model_path):
        st.success("✓ Local embedding model found in Drive")
    else:
        st.warning("⚠ Local model not found in Drive")

    if st.session_state['vectorstore'] is not None:
        st.success("✓ Document loaded")
    else:
        st.info("ℹ No document loaded yet")


Writing app.py


In [29]:
# ========================================
# PART 3: Run the App (Run after PART 2)
# ========================================

# Setup ngrok for public URL (optional, but recommended for Colab).
# pyngrok is also installed in PART 1; repeating the install here lets this
# cell be run on its own.
!pip install -q pyngrok

from pyngrok import ngrok
import getpass

# Get ngrok auth token (sign up at https://ngrok.com for free).
# getpass reads the token interactively, so it never appears in the notebook
# source.
print("Get your free ngrok token from: https://dashboard.ngrok.com/get-started/your-authtoken")
ngrok_token = getpass.getpass("Enter your ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)

# Start Streamlit in background. The trailing `&` backgrounds the command and
# `&>/dev/null` discards all of its output, so startup errors are not visible
# from this cell.
!streamlit run app.py &>/dev/null &

# Wait for Streamlit to start (fixed 5 second pause; there is no readiness check).
import time
time.sleep(5)

# Create ngrok tunnel to local port 8501.
# NOTE(review): no --server.port is passed to Streamlit above, so this
# presumably relies on Streamlit's default port being 8501 - confirm.
public_url = ngrok.connect(8501)
# Printed unconditionally; nothing here verifies that the app is serving.
print(f"\n✓ App is running!")
# `public_url` is the tunnel object returned by ngrok.connect, so this prints
# its repr (NgrokTunnel: "<public>" -> "<local>") rather than a bare URL.
print(f"🌐 Public URL: {public_url}")
print(f"📱 Open this URL in your browser to access the app")

# Record the tunnels that are currently open.
# NOTE(review): nothing below blocks, so the cell finishes right after the
# next print; presumably the tunnel stays up because of the ngrok process
# started by pyngrok, not because this cell keeps running - confirm.
ngrok_process = ngrok.get_tunnels()
print("\n⚠️ Keep this cell running to maintain the connection!")

# Alternative: Run without ngrok (local only)
# Uncomment this if you don't want to use ngrok:
# !streamlit run app.py --server.port 8501

# ========================================
# HELPER: Restart Ollama (Run this if Ollama crashes)
# ========================================

# Run this cell if you get "Cannot assign requested address" error
import subprocess
import os
import time

print("Restarting Ollama server...")

# Kill any existing Ollama processes
!pkill -f ollama
time.sleep(2)

# Start Ollama server
os.environ['OLLAMA_MODELS'] = '/content/drive/MyDrive/Ollama_Models'

with open('/tmp/ollama.log', 'w') as f:
    ollama_process = subprocess.Popen(
        ['ollama', 'serve'],
        stdout=f,
        stderr=subprocess.STDOUT,
        env=os.environ.copy()
    )

print("Waiting for Ollama to start...")
time.sleep(10)

# Verify it's working
result = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
if result.returncode == 0:
    print("✓ Ollama server restarted successfully!")
    print("\nAvailable models:")
    print(result.stdout)
else:
    print("⚠️ Ollama may have issues. Check the log:")
    !tail -20 /tmp/ollama.log

print("\n🔄 Now refresh your Streamlit app in the browser")

# ========================================
# HELPER: Check Ollama Status
# ========================================

# Run this to check if Ollama is running
!ps aux | grep ollama | grep -v grep
print("\nOllama models:")
!ollama list

print("\nTest Ollama connection:")
!curl -s http://localhost:11434/api/tags

print("\n\nOllama log (last 20 lines):")
!tail -20 /tmp/ollama.lo

Get your free ngrok token from: https://dashboard.ngrok.com/get-started/your-authtoken
Enter your ngrok auth token: ··········

✓ App is running!
🌐 Public URL: NgrokTunnel: "https://07279fbedaa2.ngrok-free.app" -> "http://localhost:8501"
📱 Open this URL in your browser to access the app

⚠️ Keep this cell running to maintain the connection!
Restarting Ollama server...
Waiting for Ollama to start...
✓ Ollama server restarted successfully!

Available models:
NAME         ID              SIZE      MODIFIED       
gemma3:1b    8648f39daa8f    815 MB    37 minutes ago    


🔄 Now refresh your Streamlit app in the browser
root       15085  1.2  0.3 6386716 45448 ?       Sl   01:17   0:00 ollama serve

Ollama models:
NAME         ID              SIZE      MODIFIED       
gemma3:1b    8648f39daa8f    815 MB    37 minutes ago    

Test Ollama connection:
{"models":[{"name":"gemma3:1b","model":"gemma3:1b","modified_at":"2025-10-01T00:39:43Z","size":815319791,"digest":"8648f39daa8fbf5b18c7b4e6a8f