In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# 1. Load Pretrained Model and Tokenizer
# We will use 'gpt2', a medium-sized decoder-only model.
# The `GPT2LMHeadModel` is specifically designed for language modeling (text generation).
# The first time you run this, it will download the model and tokenizer, which may take a few minutes.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the model to evaluation mode. This is important for consistent and reproducible results.
model.eval()

# 2. Prepare the Input
# This is the text prompt from which the model will start generating.
input_prompt = "The future of artificial intelligence is"

# Tokenize the input prompt. The tokenizer converts the text into a sequence of numbers (input IDs)
# that the model can understand.
# `return_tensors='pt'` ensures the output is a PyTorch tensor.
input_ids = tokenizer.encode(input_prompt, return_tensors='pt')

# 3. Perform Inference (Generate Output)
# The `generate` method creates a sequence of tokens following the input prompt.
# - `max_length`: The maximum length of the generated sequence (including the prompt).
# - `num_return_sequences`: The number of different sequences to generate.
# - `no_repeat_ngram_size`: Prevents the model from repeating the same n-grams.
# - `pad_token_id`: Sets the padding token ID to the end-of-sequence token ID for open-ended generation.
with torch.no_grad(): # Disable gradient calculation for inference to save memory and computations
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=50,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id
    )

# 4. Decode the Output
# The tokenizer is used again to convert the generated token IDs back into human-readable text.
# `skip_special_tokens=True` removes any special tokens (like padding or end-of-sequence) from the output.
generated_sequence = output_sequences[0].tolist()
decoded_output = tokenizer.decode(generated_sequence, skip_special_tokens=True)

# 5. Display Input and Output
print("--- Input ---")
print(input_prompt)
print("\n--- Output ---")
print(decoded_output)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


--- Input ---
The future of artificial intelligence is

--- Output ---
The future of artificial intelligence is uncertain.

"We're not sure what the future will look like," said Dr. Michael S. Schoenfeld, a professor of computer science at the University of California, Berkeley. "But we're very


In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# 1. Load Pretrained Model and Tokenizer
# It's crucial to load the model with `torchscript=True`.
# This flag correctly handles the tied weights between the embedding and language model head,
# which is necessary for successful tracing of models like GPT-2.
print("Loading pretrained GPT-2 model and tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', torchscript=True)

# Set the model to evaluation mode. This disables dropout and other training-specific layers.
model.eval()

# 2. Create a Dummy Input
# The trace needs an example input to run through the model and record the operations.
# The dimensions (batch_size, sequence_length) of this dummy input will be fixed in the trace.
# Any future inputs to the traced model must have these exact dimensions.
batch_size = 1
sequence_length = 10  # You can choose a representative sequence length
dummy_input = torch.randint(0, tokenizer.vocab_size, (batch_size, sequence_length))

print(f"\nCreated a dummy input tensor with shape: {dummy_input.shape}")

# 3. Trace the Model
# We use `torch.jit.trace` to record a single forward pass of the model.
# This creates a `ScriptModule` containing the traced computation graph.
print("Tracing the model... (This may take a moment)")
with torch.no_grad():
    traced_model = torch.jit.trace(model, dummy_input)

# 4. Save the Traced Model
# The traced model is saved to a file. This file can be loaded later
# in Python or in other environments like a C++ application.
trace_file_path = "traced_gpt2_model.pt"
traced_model.save(trace_file_path)

print(f"\nTrace file successfully created and saved to: {trace_file_path}")

# --- Optional: Verify the Traced Model ---
print("\nVerifying the traced model by loading it and running inference...")

# Load the saved trace file
loaded_traced_model = torch.jit.load(trace_file_path)

# Run the same dummy input through the loaded model
with torch.no_grad():
    outputs_from_traced_model = loaded_traced_model(dummy_input)
    # The output of GPT2LMHeadModel is a tuple, where the first element is the logits
    logits_from_traced_model = outputs_from_traced_model[0]

print("Verification successful!")
print(f"Output logits shape from traced model: {logits_from_traced_model.shape}")

Loading pretrained GPT-2 model and tokenizer...


Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.



Created a dummy input tensor with shape: torch.Size([1, 10])
Tracing the model... (This may take a moment)


  if sequence_length != 1:



Trace file successfully created and saved to: traced_gpt2_model.pt

Verifying the traced model by loading it and running inference...
Verification successful!
Output logits shape from traced model: torch.Size([1, 10, 50257])


In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import threading
import time
from pynvml import *

# --- Monitoring Thread Function ---
def monitor_gpu_stats(stop_event, stats):
    """
    Monitors GPU stats in a separate thread and records them.
    """
    try:
        nvmlInit()
        # Assuming you are using the first GPU (index 0)
        handle = nvmlDeviceGetHandleByIndex(0)
        
        power_readings = []
        temp_readings = []
        util_readings = []

        while not stop_event.is_set():
            # Get power usage in Watts
            power_usage = nvmlDeviceGetPowerUsage(handle) / 1000.0  # Convert from mW to W
            power_readings.append(power_usage)

            # Get temperature in Celsius
            temperature = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
            temp_readings.append(temperature)

            # Get utilization rates
            utilization = nvmlDeviceGetUtilizationRates(handle)
            util_readings.append(utilization.gpu)
            
            time.sleep(0.1) # Poll every 100ms
            
        # Store results
        stats['power_avg_w'] = sum(power_readings) / len(power_readings) if power_readings else 0
        stats['power_peak_w'] = max(power_readings) if power_readings else 0
        stats['temp_avg_c'] = sum(temp_readings) / len(temp_readings) if temp_readings else 0
        stats['temp_peak_c'] = max(temp_readings) if temp_readings else 0
        stats['util_avg_percent'] = sum(util_readings) / len(util_readings) if util_readings else 0
        stats['util_peak_percent'] = max(util_readings) if util_readings else 0

    finally:
        nvmlShutdown()

# --- Main Inference Code ---
# 1. Load Model and Tokenizer
print("Loading model...")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Move model to GPU if available
if torch.cuda.is_available():
    model.to('cuda')
    print("Model moved to GPU.")

# 2. Prepare Input
input_prompt = "The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI,"
input_ids = tokenizer.encode(input_prompt, return_tensors='pt')
if torch.cuda.is_available():
    input_ids = input_ids.to('cuda')

# 3. Start Monitoring and Perform Inference
stats = {}
stop_event = threading.Event()

# Start the monitoring thread only if a CUDA device is found
if torch.cuda.is_available():
    monitor_thread = threading.Thread(target=monitor_gpu_stats, args=(stop_event, stats))
    monitor_thread.start()

print("\nStarting inference...")
inference_start_time = time.time()

with torch.no_grad():
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=150,
        num_return_sequences=1
    )

inference_duration = time.time() - inference_start_time
print("Inference complete.") 

# Stop the monitoring thread
if torch.cuda.is_available():
    stop_event.set()
    monitor_thread.join()

# 4. Decode and Display Results
decoded_output = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

print("\n--- Model Output ---")
print(decoded_output)

print(f"\n--- Performance and Power Stats ---")
print(f"Inference Time: {inference_duration:.2f} seconds")
if stats:
    print(f"Average GPU Power Draw: {stats['power_avg_w']:.2f} W")
    print(f"Peak GPU Power Draw:    {stats['power_peak_w']:.2f} W")
    print(f"Average GPU Temp:       {stats['temp_avg_c']:.1f}°C")
    print(f"Peak GPU Temp:          {stats['temp_peak_c']:.1f}°C")
    print(f"Average GPU Utilization:  {stats['util_avg_percent']:.1f}%")
    print(f"Peak GPU Utilization:     {stats['util_peak_percent']:.1f}%")
else:
    print("No NVIDIA GPU detected. Power stats are not available with this method.")

Loading model...


Exception in thread Thread-3 (monitor_gpu_stats):
Traceback (most recent call last):
  File "C:\Users\vaibh\anaconda3\envs\pytorch\Lib\site-packages\pynvml.py", line 641, in _LoadNvmlLibrary
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
    nvmlLib = CDLL(os.path.join(os.getenv("ProgramFiles", "C:/Program Files"), "NVIDIA Corporation/NVSMI/nvml.dll"))
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\vaibh\anaconda3\envs\pytorch\Lib\ctypes\__init__.py", line 379, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: Could not find module 'C:\Program Files\NVIDIA Corporation\NVSMI\nvml.dll' (or one of its dependencies). Try using the full path with constructor syntax.

During handling of the above exception,

Model moved to GPU.

Starting inference...
Inference complete.

--- Model Output ---
The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI, we will need to consider the potential of AI to solve many of the problems we face today.

The Future of Artificial Intelligence

The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI, we will need to consider the potential of AI to solve many of the problems we face today.

The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI, we will need to consider the potential of AI to solve many of the problems we face today.

The future of artificial intelligence is a complex topic with many facets to consider

--- Performance and Power Stats ---
Inference Time: 4.95 seconds
No NVIDIA GPU detected. Power stats are not available with this method.


In [4]:
pip install codecarbon

Collecting codecarbon
  Downloading codecarbon-3.0.2-py3-none-any.whl.metadata (9.1 kB)
Collecting click (from codecarbon)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting fief-client[cli] (from codecarbon)
  Downloading fief_client-0.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting psutil>=6.0.0 (from codecarbon)
  Using cached psutil-7.0.0-cp37-abi3-win_amd64.whl.metadata (23 kB)
Collecting py-cpuinfo (from codecarbon)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)
Collecting pydantic (from codecarbon)
  Downloading pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting questionary (from codecarbon)
  Downloading questionary-2.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting rapidfuzz (from codecarbon)
  Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl.metadata (12 kB)
Collecting rich (from codecarbon)
  Downloading rich-14.0.0-py3-none-any.whl.metadata (18 kB)
Collecting typer (from codecarbon)
  Downloading typer-0.16.0-py

In [5]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from codecarbon import track_emissions

# The @track_emissions decorator will automatically measure the
# energy and carbon footprint of this function.
@track_emissions(project_name="GPT2_Inference")
def run_gpt2_inference():
    """
    Loads the GPT-2 model and performs a single inference task.
    """
    # 1. Load Model and Tokenizer
    print("Loading model...")
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Move model to GPU if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    print(f"Model moved to {device}.")

    # 2. Prepare Input
    input_prompt = "The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI,"
    input_ids = tokenizer.encode(input_prompt, return_tensors='pt').to(device)

    # 3. Perform Inference
    print("\nStarting inference...")
    with torch.no_grad():
        output_sequences = model.generate(
            input_ids=input_ids,
            max_length=150,
            num_return_sequences=1
        )
    print("Inference complete.")

    # 4. Decode Output
    decoded_output = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    
    print("\n--- Model Output ---")
    print(decoded_output)


# --- Run the tracked function ---
if __name__ == "__main__":
    run_gpt2_inference()
    print("\nMeasurement complete. Check the 'emissions.csv' file for power and carbon stats.")

[codecarbon INFO @ 15:04:30] [setup] RAM Tracking...
[codecarbon INFO @ 15:04:30] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 15:04:33] CPU Model on constant consumption mode: 12th Gen Intel(R) Core(TM) i7-12650H
[codecarbon INFO @ 15:04:33] [setup] GPU Tracking...
[codecarbon INFO @ 15:04:33] No GPU found.
[codecarbon INFO @ 15:04:33] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 15:04:33] >>> Tracker's metadata:
[codecarbon INFO @ 15:04:33]   Platform system: Windows-11-10.0.26200-SP0
[codecarbon INFO @ 15:04:33]   Python version: 3.12.9
[codecarbon INFO @ 15:04:33]   CodeCarbon version: 3.0.2
[codecarbon INFO @ 15:04:33]   Available RAM : 15.676 GB
[codecarbon INFO @ 15:04:33]   CPU count: 16 thread(s) in 16 physical CP

Loading model...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model moved to cuda.

Starting inference...


[codecarbon INFO @ 15:04:40] 
Graceful stopping: collecting and writing information.
Please wait a few seconds...
[codecarbon INFO @ 15:04:40] Energy consumed for RAM : 0.000019 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 15:04:40] Delta energy consumed for CPU with constant : 0.000080 kWh, power : 42.5 W
[codecarbon INFO @ 15:04:40] Energy consumed for All CPU : 0.000080 kWh
[codecarbon INFO @ 15:04:40] 0.000098 kWh of electricity used since the beginning.
[codecarbon INFO @ 15:04:40] Done!



Inference complete.

--- Model Output ---
The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI, we will need to consider the potential of AI to solve many of the problems we face today.

The Future of Artificial Intelligence

The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI, we will need to consider the potential of AI to solve many of the problems we face today.

The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI, we will need to consider the potential of AI to solve many of the problems we face today.

The future of artificial intelligence is a complex topic with many facets to consider

Measurement complete. Check the 'emissions.csv' file for power and carbon stats.


In [7]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import threading
import time
import sys

try:
    from pynvml import *
except ImportError:
    print("Error: 'pynvml' library not found. Please install it using 'pip install pynvml'")
    sys.exit(1)


# --- Step 1: Verify CUDA Setup ---
if not torch.cuda.is_available():
    print("--------------------------------------------------------------------------")
    print("PyTorch cannot find a CUDA-enabled GPU.")
    print("Please check the following:")
    print("1. You have an NVIDIA GPU.")
    print("2. The latest NVIDIA drivers are installed.")
    print("3. You have installed PyTorch with CUDA support. See https://pytorch.org/")
    print("--------------------------------------------------------------------------")
    sys.exit(1)


# --- Monitoring Thread Function (Corrected) ---
def monitor_gpu_stats(stop_event, stats_dict):
    """Monitors GPU stats in a separate thread and records them."""
    try:
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(0)
        
        power_readings = []
        temp_readings = []
        util_readings = []

        while not stop_event.is_set():
            power_readings.append(nvmlDeviceGetPowerUsage(handle) / 1000.0) # Watts
            temp_readings.append(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
            util_readings.append(nvmlDeviceGetUtilizationRates(handle).gpu)
            time.sleep(0.1)
            
        # --- FIX STARTS HERE ---
        # Safely calculate stats, handling the case where lists might be empty
        # if the inference was too fast for the monitor to collect data.
        stats_dict['power_avg_w'] = sum(power_readings) / len(power_readings) if power_readings else 0
        stats_dict['power_peak_w'] = max(power_readings) if power_readings else 0
        stats_dict['temp_avg_c'] = sum(temp_readings) / len(temp_readings) if temp_readings else 0
        stats_dict['temp_peak_c'] = max(temp_readings) if temp_readings else 0
        stats_dict['util_avg_percent'] = sum(util_readings) / len(util_readings) if util_readings else 0
        stats_dict['util_peak_percent'] = max(util_readings) if util_readings else 0
        # --- FIX ENDS HERE ---

    except NVMLError as error:
        print(f"NVML Error in monitoring thread: {error}")
    finally:
        nvmlShutdown()

# --- Main Inference Code ---
device = torch.device("cuda")
print(f"CUDA is available. Using device: {torch.cuda.get_device_name(0)}")

print("Loading model and moving to GPU...")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)
model.eval()

input_prompt = "The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI,"
input_ids = tokenizer.encode(input_prompt, return_tensors='pt').to(device)

stats = {}
stop_event = threading.Event()

monitor_thread = threading.Thread(target=monitor_gpu_stats, args=(stop_event, stats))
monitor_thread.start()

print("\nStarting inference on GPU...")
inference_start_time = time.time()

with torch.no_grad():
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=150,
        num_return_sequences=1
    )

inference_duration = time.time() - inference_start_time
print("Inference complete.")

stop_event.set()
monitor_thread.join()

decoded_output = tokenizer.decode(output_sequences[0].cpu().tolist(), skip_special_tokens=True)

print("\n--- Model Output ---")
print(decoded_output)

print(f"\n--- GPU Performance and Power Stats ---")
print(f"Inference Time: {inference_duration:.2f} seconds")

# --- FIX STARTS HERE ---
# Use the .get() method on the dictionary for safe access. This provides a
# default value (0) if the key doesn't exist for any reason, preventing a KeyError.
print(f"Average GPU Power Draw: {stats.get('power_avg_w', 0):.2f} W")
print(f"Peak GPU Power Draw:    {stats.get('power_peak_w', 0):.2f} W")
print(f"Average GPU Temp:       {stats.get('temp_avg_c', 0):.1f}°C")
print(f"Peak GPU Temp:          {stats.get('temp_peak_c', 0):.1f}°C")
print(f"Average GPU Utilization:  {stats.get('util_avg_percent', 0):.1f}%")
print(f"Peak GPU Utilization:     {stats.get('util_peak_percent', 0):.1f}%")
# --- FIX ENDS HERE ---

CUDA is available. Using device: NVIDIA GeForce RTX 3050 Laptop GPU
Loading model and moving to GPU...


Exception in thread Thread-17 (monitor_gpu_stats):
Traceback (most recent call last):
  File "C:\Users\vaibh\anaconda3\envs\pytorch\Lib\threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "C:\Users\vaibh\anaconda3\envs\pytorch\Lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
    _threading_Thread_run(self)
  File "C:\Users\vaibh\anaconda3\envs\pytorch\Lib\threading.py", line 1012, in run
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
    self._target(*self._args, **self._kwargs)
  File "C:\Users\vaibh\AppData\Local\Temp\ipykernel_11788\4099947937.py", line 57, in monitor_gpu_stats
  File "C:\Users\vaibh\anaconda3\envs\pytorch\Lib\site-packages\pynvml.py", line 657, in nvmlShutdown
    fn = _nvmlGetFunctionPointer("nvmlShutdown")
         ^^^^^^^^^^^^^^


Starting inference on GPU...NVML Error in monitoring thread: NVML Shared Library Not Found

Inference complete.

--- Model Output ---
The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI, we will need to consider the potential of AI to solve many of the problems we face today.

The Future of Artificial Intelligence

The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI, we will need to consider the potential of AI to solve many of the problems we face today.

The future of artificial intelligence is a complex topic with many facets to consider. As we develop more advanced AI, we will need to consider the potential of AI to solve many of the problems we face today.

The future of artificial intelligence is a complex topic with many facets to consider

--- GPU Performance and Power Stats ---
Inference Time: 3.30 seconds
Average GPU Power Draw: 0.00 W
Peak GPU 