In [1]:
# ---- Notebook configuration -------------------------------------------------
# Model identifier string; not referenced by any other cell shown in this notebook.
MODEL_NAME = "Qwen/Qwen2.5-14B"

# Filesystem roots: where inputs live and where artefacts are written.
DATA_PATH = "./input"
OUTPUT_PATH = "."

# Checkpoint directory that the model and tokenizer are loaded from.
MODEL_LORA_PATH = f"{OUTPUT_PATH}/output_retrieval/output_merge"
# Directory that the loaded model and tokenizer are saved to.
MODEL_OUTPUT_PATH = f"{OUTPUT_PATH}/output_retrieval/output_quant"

In [2]:
!pip install -q datasets
!pip install -q -U bitsandbytes
!pip install transformers==4.45.0
!pip install sentence-transformers==3.1.1

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import torch

from transformers import Qwen2Model, Qwen2PreTrainedModel, Qwen2Config, BitsAndBytesConfig, AutoTokenizer
from transformers.models.qwen2.modeling_qwen2 import (
    Qwen2DecoderLayer,
    Qwen2RMSNorm,
    Qwen2Attention,
    Qwen2FlashAttention2,
    Qwen2SdpaAttention,
    Qwen2MLP,
    Qwen2RotaryEmbedding
)
from torch import nn

class ModifiedQwen2Attention(Qwen2Attention):
    """``Qwen2Attention`` variant whose ``is_causal`` attribute is forced to ``False``.

    Registered under the ``"eager"`` key of ``QWEN2_ATTENTION_CLASSES``.

    NOTE(review): whether this flag alone removes causal masking depends on how
    the parent's forward pass consumes it -- confirm for the pinned
    transformers version.
    """

    def __init__(self, *init_args, **init_kwargs):
        # Construct the layer exactly as the parent would, then override the flag.
        super().__init__(*init_args, **init_kwargs)
        self.is_causal = False


class ModifiedQwen2FlashAttention2(Qwen2FlashAttention2):
    """``Qwen2FlashAttention2`` variant whose ``is_causal`` attribute is forced to ``False``.

    Registered under the ``"flash_attention_2"`` key of ``QWEN2_ATTENTION_CLASSES``.
    """

    def __init__(self, *init_args, **init_kwargs):
        # Construct the layer exactly as the parent would, then override the flag.
        super().__init__(*init_args, **init_kwargs)
        self.is_causal = False


class ModifiedQwen2SdpaAttention(Qwen2SdpaAttention):
    """``Qwen2SdpaAttention`` variant whose ``is_causal`` attribute is forced to ``False``.

    Registered under the ``"sdpa"`` key of ``QWEN2_ATTENTION_CLASSES``.

    NOTE(review): confirm that the parent's SDPA forward pass actually reads
    ``self.is_causal`` in the pinned transformers version.
    """

    def __init__(self, *init_args, **init_kwargs):
        # Construct the layer exactly as the parent would, then override the flag.
        super().__init__(*init_args, **init_kwargs)
        self.is_causal = False


# Dispatch table from an attention-implementation name to the matching
# non-causal attention class defined above. ModifiedQwen2DecoderLayer indexes
# it with ``config._attn_implementation``.
QWEN2_ATTENTION_CLASSES = dict(
    eager=ModifiedQwen2Attention,
    flash_attention_2=ModifiedQwen2FlashAttention2,
    sdpa=ModifiedQwen2SdpaAttention,
)


class ModifiedQwen2DecoderLayer(Qwen2DecoderLayer):
    """Qwen2 decoder layer that takes its attention module from the local
    ``QWEN2_ATTENTION_CLASSES`` table.

    Args:
        config: Model configuration; ``hidden_size``, ``rms_norm_eps`` and
            ``_attn_implementation`` are read here.
        layer_idx: Index of this layer, forwarded to the attention module.

    Raises:
        KeyError: If ``config._attn_implementation`` is not a key of
            ``QWEN2_ATTENTION_CLASSES``.
    """

    def __init__(self, config: Qwen2Config, layer_idx: int):
        # Only nn.Module's initialiser runs; Qwen2DecoderLayer.__init__ is not
        # called, so every submodule is created explicitly below.
        nn.Module.__init__(self)

        width = config.hidden_size
        norm_eps = config.rms_norm_eps
        self.hidden_size = width

        # Pick the attention class that matches the configured backend.
        attention_cls = QWEN2_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)

        self.mlp = Qwen2MLP(config)
        self.input_layernorm = Qwen2RMSNorm(width, eps=norm_eps)
        self.post_attention_layernorm = Qwen2RMSNorm(width, eps=norm_eps)


class Qwen2BiModel(Qwen2Model):
    """Qwen2 backbone assembled from ``ModifiedQwen2DecoderLayer`` blocks.

    Everything except ``__init__`` and ``_no_split_modules`` is inherited from
    ``Qwen2Model``.

    NOTE(review): the forward pass and attention-mask construction are
    inherited unchanged; confirm that setting ``is_causal = False`` on the
    attention modules is sufficient for bidirectional attention with the
    chosen attention backend.

    Args:
        config: Model configuration used to size the embedding, the decoder
            stack, the final norm and the rotary embedding.
    """

    # Refers to the decoder-layer class by name.
    _no_split_modules = ["ModifiedQwen2DecoderLayer"]

    def __init__(self, config: Qwen2Config):
        # Qwen2Model.__init__ is skipped; only the pretrained-model base
        # initialiser runs, and the submodules are built below.
        Qwen2PreTrainedModel.__init__(self, config)

        hidden = config.hidden_size
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, hidden, self.padding_idx)

        decoder_blocks = [
            ModifiedQwen2DecoderLayer(config, idx)
            for idx in range(config.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(decoder_blocks)

        self._attn_implementation = config._attn_implementation
        self.norm = Qwen2RMSNorm(hidden, eps=config.rms_norm_eps)
        self.rotary_emb = Qwen2RotaryEmbedding(config=config)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

In [5]:
# bitsandbytes settings: 4-bit loading with the "nf4" quant type, double
# quantisation enabled, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [6]:
# Load the checkpoint at MODEL_LORA_PATH into the Qwen2BiModel wrapper, passing
# the bitsandbytes settings as the quantization config.
model = Qwen2BiModel.from_pretrained(
    MODEL_LORA_PATH,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [7]:
# The tokenizer is read from the same directory as the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_LORA_PATH,
)

In [8]:
# Write the loaded model to the output directory.
model.save_pretrained(save_directory=MODEL_OUTPUT_PATH)

In [9]:
# Save the tokenizer next to the model; kept as the cell's final expression so
# the returned file paths are displayed.
tokenizer.save_pretrained(save_directory=MODEL_OUTPUT_PATH)

('/content/drive/MyDrive/kaggle-eedi/exp/exp017_full/output_retrieval/output_quant/tokenizer_config.json',
 '/content/drive/MyDrive/kaggle-eedi/exp/exp017_full/output_retrieval/output_quant/special_tokens_map.json',
 '/content/drive/MyDrive/kaggle-eedi/exp/exp017_full/output_retrieval/output_quant/vocab.json',
 '/content/drive/MyDrive/kaggle-eedi/exp/exp017_full/output_retrieval/output_quant/merges.txt',
 '/content/drive/MyDrive/kaggle-eedi/exp/exp017_full/output_retrieval/output_quant/added_tokens.json',
 '/content/drive/MyDrive/kaggle-eedi/exp/exp017_full/output_retrieval/output_quant/tokenizer.json')

In [11]:
!ls -l $MODEL_OUTPUT_PATH

total 8194597
-rw------- 1 root root        605 Dec 11 16:01 added_tokens.json
-rw------- 1 root root       1266 Dec 11 16:00 config.json
-rw------- 1 root root    1671853 Dec 11 16:01 merges.txt
-rw------- 1 root root 4998634669 Dec 11 16:00 model-00001-of-00002.safetensors
-rw------- 1 root root 3376552717 Dec 11 16:01 model-00002-of-00002.safetensors
-rw------- 1 root root     196523 Dec 11 16:01 model.safetensors.index.json
-rw------- 1 root root        616 Dec 11 16:01 special_tokens_map.json
-rw------- 1 root root       7229 Dec 11 16:01 tokenizer_config.json
-rw------- 1 root root   11421896 Dec 11 16:01 tokenizer.json
-rw------- 1 root root    2776833 Dec 11 16:01 vocab.json
