# Utils Functions

In [1]:
from ipywidgets import widgets
from IPython.display import display
from pprint import pprint


def show_scrollable(content, height="300px"):
    """
    Display content in a scrollable box in Jupyter Notebook.

    Args:
        content: Content to display (string, list, dict, or any printable object)
        height: Height of the scrollable box (e.g., '300px', '500px', '50%')

    Examples:
        >>> # Display a long list
        >>> show_scrollable([f"Item {i}" for i in range(100)])

        >>> # Display a dictionary with custom height
        >>> big_dict = {i: f"Value {i}" for i in range(50)}
        >>> show_scrollable(big_dict, height='400px')

        >>> # Display string output
        >>> show_scrollable("Lorem ipsum...\\n" * 50)
    """
    out = widgets.Output(
        layout={
            "height": height,
            "overflow": "auto",
            "border": "1px solid #ddd",
            "padding": "5px",
        }
    )
    display(out)

    with out:
        if isinstance(content, (list, dict)):
            pprint(content)
        else:
            print(content) if content is not None else print("None")

# Inspecting DS R1 Qwen Model

In [2]:
from transformers import AutoModelForCausalLM, AutoConfig

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

print(config)  # Full architecture details

Qwen2Config {
  "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "max_position_embeddings": 131072,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 12,
  "num_hidden_layers": 28,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.0",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}



In [3]:
from transformers import AutoModelForCausalLM
from torchinfo import summary
import torch

# Load models
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

summary(model, input_size=(1, 128), dtypes=[torch.int64])

Layer (type:depth-idx)                             Output Shape              Param #
Qwen2ForCausalLM                                   [1, 2, 128, 128]          --
├─Qwen2Model: 1-1                                  [1, 2, 128, 128]          --
│    └─Embedding: 2-1                              [1, 128, 1536]            233,373,696
│    └─Qwen2RotaryEmbedding: 2-2                   [1, 128, 128]             --
│    └─ModuleList: 2-3                             --                        --
│    │    └─Qwen2DecoderLayer: 3-1                 [1, 128, 1536]            46,797,824
│    │    └─Qwen2DecoderLayer: 3-2                 [1, 128, 1536]            46,797,824
│    │    └─Qwen2DecoderLayer: 3-3                 [1, 128, 1536]            46,797,824
│    │    └─Qwen2DecoderLayer: 3-4                 [1, 128, 1536]            46,797,824
│    │    └─Qwen2DecoderLayer: 3-5                 [1, 128, 1536]            46,797,824
│    │    └─Qwen2DecoderLayer: 3-6                 [1, 128, 1536] 

In [None]:
from transformers import AutoModelForCausalLM

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

for name, module in model.named_modules():
    # if "layers" in name and "proj" in name:
    print(name, module)

In [4]:
from transformers import AutoModelForCausalLM, AutoConfig

original_config = AutoConfig.from_pretrained("Qwen/Qwen1.5-1.8B")
print(original_config)  # Compare hidden_size, layers, etc.

Qwen2Config {
  "_name_or_path": "Qwen/Qwen1.5-1.8B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5504,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}



## Inspect Weights

In [19]:
from transformers import AutoModelForCausalLM
import torch.nn as nn

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)


def inspect_layer(name, module):
    """Helper function to inspect a single layer"""
    print(f"\n--- Layer: {name} ---")
    print(f"Type: {type(module)}")

    # Check if it's a standard linear layer
    if isinstance(module, nn.Linear):
        print(f"  Weight shape: {module.weight.shape} (out_features, in_features)")
        if module.bias is not None:
            print(f"  Bias shape: {module.bias.shape}")
        print(f"  First weight values (3x3):\n{module.weight.data[:3, :3]}")

    # Check for common patterns in transformer layers
    if "q_proj" in name:
        print("  [This appears to be a query projection]")
    elif "k_proj" in name:
        print("  [This appears to be a key projection]")
    elif "v_proj" in name:
        print("  [This appears to be a value projection]")
    elif "o_proj" in name:
        print("  [This appears to be an output projection]")
    elif "gate_proj" in name or "up_proj" in name or "down_proj" in name:
        print("  [This appears to be a feed-forward layer component]")


# Iterate through all layers
for name, module in model.named_modules():
    # Skip very high-level modules to reduce output
    if len(name.split(".")) > 6:  # Adjust this number as needed
        continue

    # Only inspect certain types of layers
    if isinstance(module, nn.Linear) or "proj" in name or "attention" in name:
        inspect_layer(name, module)

# Additional inspection of the first layer's weights
print("\n=== Detailed First Layer Inspection ===")
for name, param in model.named_parameters():
    if "layers.0" in name and "weight" in name:
        print(f"\nParameter: {name}")
        print(f"Shape: {param.shape}")
        print(
            f"First few values:\n{param.data[:2, :5] if len(param.shape) > 1 else param.data[:5]}"
        )
        break


--- Layer: model.layers.0.self_attn.q_proj ---
Type: <class 'torch.nn.modules.linear.Linear'>
  Weight shape: torch.Size([1536, 1536]) (out_features, in_features)
  Bias shape: torch.Size([1536])
  First weight values (3x3):
tensor([[-0.0300,  0.0226,  0.0251],
        [-0.0177, -0.0050,  0.0713],
        [-0.0033, -0.0170,  0.0043]])
  [This appears to be a query projection]

--- Layer: model.layers.0.self_attn.k_proj ---
Type: <class 'torch.nn.modules.linear.Linear'>
  Weight shape: torch.Size([256, 1536]) (out_features, in_features)
  Bias shape: torch.Size([256])
  First weight values (3x3):
tensor([[-0.0645,  0.0148, -0.1377],
        [ 0.0254, -0.0625,  0.0957],
        [ 0.0068, -0.0386, -0.0035]])
  [This appears to be a key projection]

--- Layer: model.layers.0.self_attn.v_proj ---
Type: <class 'torch.nn.modules.linear.Linear'>
  Weight shape: torch.Size([256, 1536]) (out_features, in_features)
  Bias shape: torch.Size([256])
  First weight values (3x3):
tensor([[ 0.0123, -0

# LLAMA3 Models

In [1]:
from transformers import AutoModelForCausalLM, AutoConfig

original_config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B")
print(original_config)  # Compare hidden_size, layers, etc.

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3-8B.
401 Client Error. (Request ID: Root=1-683353fb-51039aed3df2f93708f1d9e0;d16a0d44-2fe3-471d-beec-565097c2ccd1)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
from transformers import AutoModelForCausalLM, AutoConfig

original_config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-70B")
print(original_config)  # Compare hidden_size, layers, etc.

Qwen2Config {
  "_name_or_path": "Qwen/Qwen1.5-1.8B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5504,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}



# DeepSeek V3 Models

In [2]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained("deepseek-ai/deepseek-v3")
print(config)  # Compare hidden_size, layers, etc.

ValueError: Loading deepseek-ai/deepseek-v3 requires you to execute the configuration file in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error.

In [None]:
from transformers import AutoModelForCausalLM
from torchinfo import summary
import torch

# Load models
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

summary(model, input_size=(1, 128), dtypes=[torch.int64])

# Inspect Layer Files

In [1]:
import torch
from pathlib import Path
import matplotlib.pyplot as plt


def inspect_quantized_file(file_path: Path, sample_size: int = 5):
    """
    Inspects a quantized model file (.pt) and prints key information.

    Args:
        file_path: Path to the .pt file
        sample_size: Number of elements to display for sampling
    """
    try:
        # Load data
        data = torch.load(file_path)
        print(f"\n{'='*50}\nInspecting: {file_path.name}\n{'='*50}")

        # Basic info
        print(f"[Type] {type(data)}")

        if isinstance(data, torch.Tensor):
            # Handle raw tensor case
            print("\n[Raw Tensor]")
            print(f"Shape: {data.shape}")
            print(f"dtype: {data.dtype}")
            print(f"Device: {data.device}")

            # Statistics
            if data.dtype in (torch.float16, torch.float32, torch.bfloat16):
                data_float = data.float()
                print(f"\n[Statistics]")
                print(f"Min: {data_float.min().item():.4f}")
                print(f"Max: {data_float.max().item():.4f}")
                print(f"Mean: {data_float.mean().item():.4f}")
                print(f"Std: {data_float.std().item():.4f}")

            # Sample values
            print(f"\n[Sample Values (first {sample_size} elements)]")
            print(data.flatten()[:sample_size].tolist())

            # Plot histogram if reasonable size
            if data.numel() < 1e6:  # Don't plot for huge tensors
                plt.figure(figsize=(10, 4))
                plt.hist(data.float().cpu().numpy().flatten(), bins=50)
                plt.title(f"Value Distribution - {file_path.name}")
                plt.xlabel("Value")
                plt.ylabel("Frequency")
                plt.show()

        elif isinstance(data, dict):
            # Handle quantized weight dictionary
            print("\n[Quantized Weight Structure]")
            print("Keys:", list(data.keys()))

            # Required fields
            qweight = data["weight"]
            print(f"\n[Weight Tensor]")
            print(f"Shape: {qweight.shape}")
            print(f"dtype: {qweight.dtype}")
            print(f"Device: {qweight.device}")

            # Sample values
            print(f"\n[Sample Weight Values (first {sample_size} elements)]")
            print(qweight.flatten()[:sample_size].tolist())

            # Check for quantization parameters
            if "scales" in data:
                scales = data["scales"]
                print(f"\n[Scales]")
                print(f"Shape: {scales.shape}")
                print(f"Min: {scales.min().item():.4f}")
                print(f"Max: {scales.max().item():.4f}")
                print(f"Sample: {scales.flatten()[:sample_size].tolist()}")

            if "zeros" in data:
                zeros = data["zeros"]
                print(f"\n[Zeros]")
                print(f"Shape: {zeros.shape}")
                print(f"Sample: {zeros.flatten()[:sample_size].tolist()}")

            if "bias" in data:
                bias = data["bias"]
                print(f"\n[Bias]")
                print(f"Shape: {bias.shape}")
                print(f"Sample: {bias.flatten()[:sample_size].tolist()}")

            # Special handling for packed 4-bit weights
            if qweight.dtype == torch.int32:
                print("\n[4-bit Packed Weights]")
                packed_val = qweight[0, 0].item()
                unpacked = [(packed_val >> (4 * i)) & 0xF for i in range(8)]
                print(f"First packed int32: {packed_val} → Unpacked 4-bit: {unpacked}")

        else:
            print("\n[Unknown data format]")
            print(data)

    except Exception as e:
        print(f"\n[ERROR] Failed to inspect {file_path}: {str(e)}")


def main():
    # Configure these paths
    base_dir = Path("/home/xzhang/models/deepseek-awq-scrooge/quantized_layers")
    target_file = "model.layers.0.self_attn.k_proj.pt"  # Or use *.pt to process all

    # Single file inspection
    inspect_quantized_file(base_dir / target_file)

    # Uncomment to process all .pt files in directory
    # for pt_file in base_dir.glob("*.pt"):
    #     inspect_quantized_file(pt_file)


if __name__ == "__main__":
    main()


Inspecting: model.layers.0.self_attn.k_proj.pt
[Type] <class 'dict'>

[Quantized Weight Structure]
Keys: ['qweight', 'qzeros', 'scales', 'bias']

[ERROR] Failed to inspect /home/xzhang/models/deepseek-awq-scrooge/quantized_layers/model.layers.0.self_attn.k_proj.pt: 'weight'


In [5]:
import torch
from pathlib import Path

base_dir = Path("/home/xzhang/models/deepseek-awq-scrooge/quantized_layers")
print("Available files:", list(base_dir.glob("*")))

layer_file = "model.layers.0.self_attn.q_proj.pt"  # Change as needed
target_file = base_dir / layer_file
data = torch.load(target_file)

print("-" * 40)

Available files: [PosixPath('/home/xzhang/models/deepseek-awq-scrooge/quantized_layers/model.layers.19.mlp.up_proj.pt'), PosixPath('/home/xzhang/models/deepseek-awq-scrooge/quantized_layers/model.layers.27.self_attn.q_proj.pt'), PosixPath('/home/xzhang/models/deepseek-awq-scrooge/quantized_layers/model.layers.2.self_attn.k_proj.pt'), PosixPath('/home/xzhang/models/deepseek-awq-scrooge/quantized_layers/model.layers.20.self_attn.v_proj.pt'), PosixPath('/home/xzhang/models/deepseek-awq-scrooge/quantized_layers/model.layers.8.self_attn.v_proj.pt'), PosixPath('/home/xzhang/models/deepseek-awq-scrooge/quantized_layers/model.layers.12.self_attn.q_proj.pt'), PosixPath('/home/xzhang/models/deepseek-awq-scrooge/quantized_layers/model.layers.22.self_attn.q_proj.pt'), PosixPath('/home/xzhang/models/deepseek-awq-scrooge/quantized_layers/model.layers.8.self_attn.k_proj.pt'), PosixPath('/home/xzhang/models/deepseek-awq-scrooge/quantized_layers/model.layers.18.self_attn.v_proj.pt'), PosixPath('/home/x

In [7]:
import torch
from pathlib import Path

base_dir = Path("/home/xzhang/models/deepseek-awq-scrooge/quantized_layers")

layer_file = "model.layers.9.mlp.down_proj.pt"  # Change as needed
target_file = base_dir / layer_file
data = torch.load(target_file)


if isinstance(data, torch.Tensor):
    print(f"Tensor shape: {data.shape}")
    print(f"Tensor dtype: {data.dtype}")
    print("\nValues:")
    print(data)

elif isinstance(data, dict):
    print("Dictionary contents:")
    for key, value in data.items():
        print(f"\n{key}:")
        if isinstance(value, torch.Tensor):
            print(f"Shape: {value.shape}")
            print(f"Dtype: {value.dtype}")
            if value.numel() <= 10:  # Print full tensor if small
                print("Values:")
                print(value)
            else:
                print(
                    "First few values:" if len(value.shape) == 1 else "First few rows:"
                )
                print(value[:5] if len(value.shape) == 1 else value[:5, :5])
        else:
            print(value)
else:
    print("Unknown data type:")
    print(data)

Dictionary contents:

qweight:
Shape: torch.Size([8960, 192])
Dtype: torch.int32
First few rows:
tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]], dtype=torch.int32)

qzeros:
Shape: torch.Size([70, 192])
Dtype: torch.int32
First few rows:
tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]], dtype=torch.int32)

scales:
Shape: torch.Size([70, 1536])
Dtype: torch.float16
First few rows:
tensor([[4.6250e+00, 3.6875e+00, 6.1562e+00, 3.7656e+00, 3.3750e+00],
        [7.7820e-03, 8.0566e-03, 5.9509e-03, 9.2163e-03, 6.8054e-03],
        [1.0014e-04, 1.0014e-04, 1.0014e-04, 1.0014e-04, 1.0014e-04],
        [1.7452e-04, 2.8038e-04, 2.6894e-04, 1.7262e-04, 2.5749e-04],
        [1.0014e-04, 1.0014e-04, 1.0014e-04, 1.0014e-04, 1.0014e-04]],
       dtype=torch.float16)


## Qweights

In [None]:
import os
import torch
from pathlib import Path


def inspect_qweights_in_dir(layer_dir: Path, zero_threshold: float = 0.9):
    """
    Inspect qweight statistics for all saved quantized layer files in a directory.

    Args:
        layer_dir (Path): Directory containing *.pt quantized layer files.
        zero_threshold (float): Warn if percentage of zeros in qweight exceeds this.
    """
    layer_dir = Path(layer_dir).expanduser()
    if not layer_dir.exists():
        raise FileNotFoundError(f"Directory not found: {layer_dir}")

    pt_files = sorted(layer_dir.glob("*.pt"))
    if not pt_files:
        print("❌ No .pt layer files found in directory.")
        return

    print(f"\n🔍 Inspecting {len(pt_files)} layer files in: {layer_dir}\n")

    for f in pt_files:
        try:
            state_dict = torch.load(f, map_location="cpu")
            qweight = state_dict.get("qweight", None)

            if qweight is None:
                print(f"⚠️  {f.name}: Missing `qweight` key.")
                continue

            zeros = (qweight == 0).sum().item()
            total = qweight.numel()
            zero_pct = zeros / total

            unique_vals = torch.unique(qweight)
            preview_vals = unique_vals.tolist()[:10]

            flag = "⚠️ HIGH ZERO RATIO!" if zero_pct > zero_threshold else "✅"

            # ✅ FIX: Convert tuple to string before formatting
            shape_str = str(tuple(qweight.shape))

            print(
                f"{f.name:<40} | shape: {shape_str:<20} | "
                f"zeros: {zero_pct:.2%} | unique[:10]: {preview_vals} {flag}"
            )
            print()
        except Exception as e:
            print(f"❌ {f.name}: Error loading or parsing file — {str(e)}")


if __name__ == "__main__":
    # Example usage — adjust this path if needed
    quantized_dir = Path("~/models/deepseek-awq-scrooge/quantized_layers")
    inspect_qweights_in_dir(quantized_dir)


🔍 Inspecting 168 layer files in: /home/xzhang/models/deepseek-awq-scrooge/quantized_layers

model.layers.0.mlp.down_proj.pt          | shape: (8960, 192)          | zeros: 31.48% | unique[:10]: [-2147482912, -2147477591, -2147474320, -2147471587, -2147471339, -2147469824, -2147355904, -2147305013, -2147254939, -2147245090] ✅
model.layers.0.mlp.gate_proj.pt          | shape: (1536, 1120)         | zeros: 20.27% | unique[:10]: [-2147483648, -2147483551, -2147483522, -2147483493, -2147483076, -2147483016, -2147482477, -2147481730, -2147480688, -2147476564] ✅
model.layers.0.mlp.up_proj.pt            | shape: (1536, 1120)         | zeros: 38.93% | unique[:10]: [-2147483648, -2147483606, -2147483536, -2147483449, -2147482695, -2147482592, -2147481602, -2147474684, -2147473856, -2147471600] ✅
model.layers.0.self_attn.k_proj.pt       | shape: (1536, 32)           | zeros: 7.35% | unique[:10]: [-2147200187, -2130016890, -2121405444, -2095701717, -2093492976, -2089025545, -2047559711, -20220251

In [33]:
import torch
from pathlib import Path

# Path to the file
base_dir = Path("/home/xzhang/models/deepseek-awq-scrooge/quantized_layers")
layer_file = "model.layers.0.self_attn.q_proj.pt"
target_file = base_dir / layer_file

# Load the file
data = torch.load(target_file, map_location="cpu")

print(f"\nContents of {layer_file}:")

if isinstance(data, dict):
    for key, value in data.items():
        if isinstance(value, torch.Tensor):
            print(f"  {key:<10} → shape: {tuple(value.shape)}, dtype: {value.dtype}")
        else:
            print(f"  {key:<10} → type: {type(value).__name__}")
else:
    print("File content is not a dict.")


Contents of model.layers.0.self_attn.q_proj.pt:
  qweight    → shape: (1536, 192), dtype: torch.int32
  qzeros     → shape: (12, 192), dtype: torch.int32
  scales     → shape: (12, 1536), dtype: torch.float16
  bias       → shape: (1536,), dtype: torch.float16


In [23]:
# check_quantization_efficiency.py
import torch
from pathlib import Path
import matplotlib.pyplot as plt


def analyze_quantization(file_path):
    data = torch.load(file_path)

    if not isinstance(data, dict):
        print("Error: Expected quantized layer dictionary")
        return

    print(f"\nAnalyzing {file_path.name}")
    print("-" * 50)

    # Extract parameters
    qweight = data["qweight"]  # shape: [in_features, out_features//8]
    scales = data["scales"]  # shape: [in_features//group_size, out_features]
    qzeros = data["qzeros"]  # shape: [in_features//group_size, out_features//8]

    group_size = qweight.shape[0] // scales.shape[0]
    print(f"Group size: {group_size}")
    print(f"Scales range: {scales.min().item():.4f} - {scales.max().item():.4f}")

    # Properly expand dimensions for broadcasting
    scales = scales.view(
        -1, group_size, scales.shape[-1]
    )  # [groups, group_size, out_features]
    scales = scales.transpose(1, 2)  # [groups, out_features, group_size]
    scales = scales.reshape(-1, scales.shape[-1])  # [groups*out_features, group_size]

    qzeros = qzeros.view(-1, 1, qzeros.shape[-1])  # [groups, 1, out_features//8]
    qzeros = qzeros.expand(-1, group_size, -1)  # [groups, group_size, out_features//8]
    qzeros = qzeros.reshape(-1, qzeros.shape[-1])  # [in_features, out_features//8]

    # Dequantize sample weights
    sample_qweight = qweight[:group_size]  # First group only for demo
    sample_qzeros = qzeros[:group_size]
    sample_scales = scales[:group_size]

    dequant_weight = (sample_qweight - sample_qzeros) * sample_scales
    print(
        f"Sample dequantized range: {dequant_weight.min().item():.4f} - {dequant_weight.max().item():.4f}"
    )

    # Check 4-bit utilization
    quantized_values = qweight.unique(sorted=True)
    print(f"Unique 4-bit values: {len(quantized_values)}/16")
    print(
        f"Value range: {quantized_values.min().item()} to {quantized_values.max().item()}"
    )

    # Plot first group's weights
    plt.hist(qweight[:group_size].cpu().flatten().numpy(), bins=16)
    plt.title("4-bit Weight Values (First Group)")
    plt.xlabel("Quantized Value")
    plt.ylabel("Frequency")
    plt.show()


if __name__ == "__main__":
    base_dir = Path("/home/xzhang/models/deepseek-awq-scrooge/quantized_layers")
    layer_file = "model.layers.0.self_attn.q_proj.pt"
    analyze_quantization(base_dir / layer_file)


Analyzing model.layers.0.self_attn.q_proj.pt
--------------------------------------------------
Group size: 128
Scales range: 0.1963 - 3.5938


RuntimeError: shape '[-1, 128, 1536]' is invalid for input of size 18432

In [30]:
# check_quantization_efficiency.py
import torch
from pathlib import Path
import matplotlib.pyplot as plt


def analyze_quantization(file_path):
    data = torch.load(file_path)

    if not isinstance(data, dict):
        print("Error: Expected quantized layer dictionary (qweight, scales, qzeros)")
        return

    print(f"\nAnalyzing {file_path.name}")
    print("-" * 50)

    # Extract quantization parameters
    qweight = data["qweight"]
    scales = data["scales"]
    qzeros = data["qzeros"]
    group_size = scales.shape[0] * (qweight.shape[0] // scales.shape[0])

    print(f"Group size: {group_size}")
    print(f"Scales range: {scales.min().item():.4f} - {scales.max().item():.4f}")

    # Simulate dequantization
    dequant_weight = (qweight - qzeros) * scales
    print(
        f"Dequantized weight range: {dequant_weight.min().item():.4f} - {dequant_weight.max().item():.4f}"
    )

    # Check 4-bit utilization
    quantized_values = qweight.unique(sorted=True)
    print(f"Unique 4-bit values used: {len(quantized_values)}/16 possible")
    print(
        f"Value range: {quantized_values.min().item()} to {quantized_values.max().item()}"
    )

    # Plot value distribution
    plt.figure(figsize=(10, 4))
    plt.hist(qweight.cpu().flatten().numpy(), bins=50)
    plt.title(f"4-bit Weight Distribution\n{file_path.name}")
    plt.xlabel("Quantized Value")
    plt.ylabel("Frequency")
    plt.show()


if __name__ == "__main__":
    base_dir = Path("/home/xzhang/models/deepseek-awq-scrooge/quantized_layers")
    layer_file = "model.layers.0.self_attn.q_proj.pt"  # Change as needed
    analyze_quantization(base_dir / layer_file)


Analyzing model.layers.0.self_attn.q_proj.pt
--------------------------------------------------
Group size: 1536
Scales range: 0.1963 - 3.5938


RuntimeError: The size of tensor a (1536) must match the size of tensor b (12) at non-singleton dimension 0

# Inspect Logs

In [2]:
def filter_log_file(input_file, output_file, keyword):
    try:
        with open(input_file, "r") as file:
            lines = file.readlines()

        filtered_lines = [line for line in lines if keyword not in line]

        with open(output_file, "w") as new_file:
            new_file.writelines(filtered_lines)

        print(f"Log file filtered successfully. Output saved to {output_file}")

    except FileNotFoundError:
        print(f"Input file {input_file} not found.")


log_file = "/home/xzhang/dev/deepseek_local_runner/documents/full_log_20250529_183204_resource.log"
filtered_log_file = "/home/xzhang/dev/deepseek_local_runner/documents/filtered_rcs.log"

# Usage
input_file = log_file
output_file = filtered_log_file
keyword = "[AutoMonitor]"

filter_log_file(input_file, output_file, keyword)

Log file filtered successfully. Output saved to /home/xzhang/dev/deepseek_local_runner/documents/filtered_rcs.log


In [1]:
hybrid_mode = config.get("hybrid_mode", False)
full_gpu = config.get("full_gpu", False)

hybrid_mode

NameError: name 'config' is not defined