In [1]:
from langchain_community.llms.ollama import Ollama
from langchain.prompts import ChatPromptTemplate

In [11]:
# llm = Ollama(model="gemma2:2b")

In [3]:
import subprocess
import os
# Function to stream output of the subprocess
def stream_output(process):
    for line in process.stdout:
        print(line, end='')  # Output is already a string, so no need to decode

# Function to start Ollama
def start_ollama():
    try:
        # Redirect stdout and stderr to os.devnull
        with open(os.devnull, 'w') as devnull:
            ollama_process = subprocess.Popen(
                ["ollama", "serve"], 
                stdout=devnull,  # Discard stdout
                stderr=devnull,  # Discard stderr
                shell=True       # Required for Windows
            )
            print("Ollama is starting...", flush=True)
            return ollama_process
    except Exception as e:
        print(f"Error starting Ollama: {e}", flush=True)
        return None



# When Backend starts
ollama_process = start_ollama()

Ollama is starting...


In [4]:
import os
os.getcwd()


'e:\\CSE299\\chatbot\\rag_tests'

In [5]:
import json

models = [
    "llama3.2:1b",
    "llama3.2:3b",
    "mistral:latest",
    "llama3.1:latest",
    "gemma2:2b"
]

questions = [
    "What is Newton's first law of motion?",
    "Explain the law of conservation of energy.",
    "What is the difference between velocity and speed?",
    "Define momentum and give an example.",
    "What is the formula for kinetic energy?"
]

# Function to get the model's response to a question
def get_model_answer(model, query):
    return model(query)

results = []

# Loop through models and store answers
for model_name in models:
    print(f"Evaluating Model: {model_name}")
    llm = Ollama(model=model_name)
    
    model_results = {"model": model_name, "answers": {}}
    
    for question in questions:
        answer = get_model_answer(llm, question)
        model_results["answers"][question] = answer
        print(f"Question: {question}")
        print(f"Answer: {answer}")
        print("="*50)
    
    results.append(model_results)

# Save results to a JSON file
with open("model_responses.json", "w") as file:
    json.dump(results, file)


Evaluating Model: llama3.2:1b


  return model(query)


Question: What is Newton's first law of motion?
Answer: Newton's First Law of Motion, also known as the Law of Inertia, states that "an object at rest will remain at rest, and an object in motion will continue to move with a constant velocity, unless acted upon by an external force." This means that an object will maintain its state of motion unless a force is applied to it, causing it to change its motion.
Question: Explain the law of conservation of energy.
Answer: The law of conservation of energy is a fundamental principle in physics that states that energy cannot be created or destroyed, only converted from one form to another. This means that the total amount of energy remains constant over time, but it can change forms.

In other words, energy is conserved because it is either transferred from one object to another through work (e.g., when you run, your body converts chemical energy into kinetic energy) or transformed from one type of energy to another (e.g., thermal energy, mec

In [13]:
import time

def measure_inference_time(model, query):
    start_time = time.time()
    response = model.invoke(query)  # Replace with actual generation method
    end_time = time.time()
    return end_time - start_time




In [14]:
# List of models
models = [
    "llama3.2:1b",
    "llama3.2:3b",
    "mistral:latest",
    "llama3.1:latest",
    "gemma2:2b"
]

query = "Explain Newton's first law with an example."

# Loop through the models and measure inference time for each
for model_name in models:
    llm = Ollama(model=model_name)
    time_taken = measure_inference_time(llm, query)
    print(f"Model: {model_name}, Inference Time: {time_taken:.2f} seconds")


Model: llama3.2:1b, Inference Time: 14.13 seconds
Model: llama3.2:3b, Inference Time: 17.28 seconds
Model: mistral:latest, Inference Time: 90.12 seconds
Model: llama3.1:latest, Inference Time: 143.50 seconds
Model: gemma2:2b, Inference Time: 19.56 seconds


In [15]:
import torch

# Check if CUDA (GPU) is available
if torch.cuda.is_available():
    print("GPU is available: ", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")


GPU is not available, using CPU.


In [24]:
pip install psutil





In [23]:
import psutil
import time


# Function to get memory usage in GB
def get_memory_usage():
    process = psutil.Process()  # Get the current process
    memory_usage = process.memory_info().rss / (1024 ** 3)  # Convert to GB
    return memory_usage

# List of models
models = [
    "llama3.2:1b",
    "llama3.2:3b",
    "mistral:latest",
    "llama3.1:latest",
    "gemma2:2b"
]

query = "Explain Newton's first law with an example."

# Loop through the models and measure memory usage for each
for model_name in models:
    print(f"Loading model: {model_name}")
    
    # Load the model
    llm = Ollama(model=model_name)
    
    # Wait for model loading (optional, adjust as needed)
    time.sleep(2)  # Give the model some time to load and initialize
    
    # Get memory usage after loading the model
    memory_after_loading = get_memory_usage()
    
    # Run the inference
    print("Running inference...")
    llm(query)
    
    # Get memory usage after inference
    memory_after_inference = get_memory_usage()
    
    # Calculate the memory usage during inference
    memory_used = memory_after_inference - memory_after_loading
    
    print(f"Model: {model_name}")
    print(f"Memory Usage during inference: {memory_used:.2f} GB")
    print("-" * 50)


Loading model: llama3.2:1b
Running inference...
Model: llama3.2:1b
Memory Usage during inference: 0.00 GB
--------------------------------------------------
Loading model: llama3.2:3b
Running inference...
Model: llama3.2:3b
Memory Usage during inference: 0.00 GB
--------------------------------------------------
Loading model: mistral:latest
Running inference...
Model: mistral:latest
Memory Usage during inference: 0.00 GB
--------------------------------------------------
Loading model: llama3.1:latest
Running inference...
Model: llama3.1:latest
Memory Usage during inference: 0.00 GB
--------------------------------------------------
Loading model: gemma2:2b
Running inference...
Model: gemma2:2b
Memory Usage during inference: 0.00 GB
--------------------------------------------------


In [25]:
pip install memory-profiler

Collecting memory-profilerNote: you may need to restart the kernel to use updated packages.

  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.61.0


In [26]:
from memory_profiler import memory_usage

def get_model_memory_usage():
    # Load the model and run inference
    llm = Ollama(model="llama3.2:1b")
    time.sleep(2)
    result = llm("Explain Newton's first law with an example.")
    
    return result

# Measure memory usage of the function
mem_usage = memory_usage(get_model_memory_usage)
print(f"Memory usage during model inference: {max(mem_usage) - min(mem_usage):.2f} MiB")


Memory usage during model inference: 0.01 MiB


In [30]:
import os
from time import time
from memory_profiler import memory_usage
# from ollama import Ollama

# List of models
models = [
    "llama3.2:1b",
    "llama3.2:3b",
    "mistral:latest",
    "llama3.1:latest",
    "gemma2:2b"
]

query = "Explain Newton's first law with an example."

def measure_memory_usage():
    """Returns current memory usage in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 ** 2)  # Memory in MB

def run_inference(model_name, query):
    """Load model, run inference, and measure memory usage."""
    llm = Ollama(model=model_name)
    
    # Measure memory usage before loading the model
    memory_before = measure_memory_usage()
    
    # Perform inference
    start_time = time()
    response = llm(query)  # Perform the inference with the model
    inference_time = time() - start_time
    
    # Measure memory usage after inference
    memory_after_inference = measure_memory_usage()
    
    return memory_before, memory_after_inference, inference_time, response

# Loop through the models
for model_name in models:
    print(f"Model: {model_name}")
    
    memory_before, memory_after_inference, inference_time, response = run_inference(model_name, query)
    
    print(f"Memory before loading: {memory_before:.2f} MB")
    print(f"Memory after inference: {memory_after_inference:.2f} MB")
    print(f"Memory used during inference: {memory_after_inference - memory_before:.2f} MB")
    print(f"Inference Time: {inference_time:.2f} seconds")
    print(f"Response: {response}\n")


Model: llama3.2:1b
Memory before loading: 274.02 MB
Memory after inference: 274.02 MB
Memory used during inference: 0.00 MB
Inference Time: 15.35 seconds
Response: Newton's first law of motion, also known as the Law of Inertia, states that an object at rest will remain at rest, and an object in motion will continue to move with a constant velocity, unless acted upon by an external force.

Let's consider an example to illustrate this concept.

**Example: A Rolling Ball**

Imagine you have a ball that is rolling across the floor. Initially, it comes to a stop when it hits the ground. At first glance, it might seem like the ball will keep rolling indefinitely without any change in its state of motion.

However, if you try to push or pull the ball, something interesting happens. Despite your efforts, the ball continues to roll forward at a constant speed, assuming there are no external forces acting upon it.

This is an example of Newton's first law in action. The ball has a natural tenden

In [31]:
import os
import psutil
from time import time
# from ollama import Ollama

# List of models
models = [
    "llama3.2:1b",
    "llama3.2:3b",
    "mistral:latest",
    "llama3.1:latest",
    "gemma2:2b"
]

query = "Explain Newton's first law with an example."

def measure_memory_usage():
    """Returns detailed memory usage including RAM, Virtual memory, and swap."""
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()
    # Resident Set Size (RSS) is the memory used by the process in RAM
    rss_memory = memory_info.rss / (1024 ** 2)  # in MB
    vms_memory = memory_info.vms / (1024 ** 2)  # Virtual memory in MB
    swap_memory = psutil.swap_memory().used / (1024 ** 2)  # Swap memory in MB
    return rss_memory, vms_memory, swap_memory

def run_inference(model_name, query):
    """Load model, run inference, and measure memory usage."""
    llm = Ollama(model=model_name)
    
    # Measure memory usage before loading the model
    memory_before_rss, memory_before_vms, memory_before_swap = measure_memory_usage()
    
    # Perform inference
    start_time = time()
    response = llm(query)  # Perform the inference with the model
    inference_time = time() - start_time
    
    # Measure memory usage after inference
    memory_after_rss, memory_after_vms, memory_after_swap = measure_memory_usage()
    
    return {
        "memory_before_rss": memory_before_rss,
        "memory_before_vms": memory_before_vms,
        "memory_before_swap": memory_before_swap,
        "memory_after_rss": memory_after_rss,
        "memory_after_vms": memory_after_vms,
        "memory_after_swap": memory_after_swap,
        "inference_time": inference_time,
        "response": response
    }

# Loop through the models
for model_name in models:
    print(f"Model: {model_name}")
    
    # Run inference and measure memory usage
    results = run_inference(model_name, query)
    
    print(f"Memory before loading (RSS): {results['memory_before_rss']:.2f} MB")
    print(f"Memory before loading (Virtual Memory): {results['memory_before_vms']:.2f} MB")
    print(f"Memory before loading (Swap Memory): {results['memory_before_swap']:.2f} MB")
    
    print(f"Memory after inference (RSS): {results['memory_after_rss']:.2f} MB")
    print(f"Memory after inference (Virtual Memory): {results['memory_after_vms']:.2f} MB")
    print(f"Memory after inference (Swap Memory): {results['memory_after_swap']:.2f} MB")
    
    print(f"Inference Time: {results['inference_time']:.2f} seconds")
    print(f"Response: {results['response']}\n")


Model: llama3.2:1b
Memory before loading (RSS): 274.03 MB
Memory before loading (Virtual Memory): 502.02 MB
Memory before loading (Swap Memory): 55.48 MB
Memory after inference (RSS): 279.08 MB
Memory after inference (Virtual Memory): 506.64 MB
Memory after inference (Swap Memory): 55.48 MB
Inference Time: 13.62 seconds
Response: Newton's First Law of Motion, also known as the Law of Inertia, states that an object at rest will remain at rest, and an object in motion will continue to move with a constant velocity, unless acted upon by an external force.

Here's an example to illustrate this concept:

Imagine you're on a train, and you throw a ball straight up in the air. What happens? The ball comes down and lands in your hand, right?

Now, let's say someone pushes you gently against the wall with their hands. What happens? The momentum of your body is transferred to the wall, causing it to move backward.

In both cases, there was an external force (the person pushing or the train accel

In [32]:
import time

# Assuming llm_model is already defined
# List of models
models = [
    "llama3.2:1b",
    "llama3.2:3b",
    "mistral:latest",
    "llama3.1:latest",
    "gemma2:2b"
]

query = "Explain Newton's first law with an example."

# Function to calculate token generation speed
def token_generation_speed(model, query):
    start_time = time.time()
    response = model(query)  # Call the model's inference method
    end_time = time.time()
    tokens = len(response.split())  # Assuming token == word; use proper tokenization if needed
    time_taken = end_time - start_time
    return tokens / time_taken

# Loop through the models and measure token generation speed for each
for model_name in models:
    print(f"Model: {model_name}")
    
    # Load the model
    llm = Ollama(model=model_name)
    
    # Calculate token generation speed
    speed = token_generation_speed(llm, query)
    print(f"Token Generation Speed: {speed:.2f} tokens/sec\n")


Model: llama3.2:1b
Token Generation Speed: 21.83 tokens/sec

Model: llama3.2:3b
Token Generation Speed: 13.01 tokens/sec

Model: mistral:latest
Token Generation Speed: 2.56 tokens/sec

Model: llama3.1:latest
Token Generation Speed: 1.90 tokens/sec

Model: gemma2:2b
Token Generation Speed: 14.58 tokens/sec

