In [1]:
import os
os.environ['HF_HOME'] = '/mnt/storage/hf_cache'

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, Phi3ForCausalLM
from transformers.pipelines.text_generation import Chat

torch.random.manual_seed(0)

model: Phi3ForCausalLM = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True, 
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}

# output = pipe(messages, **generation_args)
# print(output[0]['generated_text'])



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
len(model.model.layers)

32

In [4]:
model.model.layers[0].mlp.gate_up_proj.weight

Parameter containing:
tensor([[ 1.0071e-02,  1.1841e-02,  4.2725e-03,  ...,  4.8828e-02,
          1.1230e-02, -7.5989e-03],
        [ 7.4863e-05,  3.5889e-02, -4.4861e-03,  ...,  8.0078e-02,
         -5.6885e-02, -1.2390e-02],
        [ 1.7776e-03, -2.4719e-03,  4.6631e-02,  ..., -1.9302e-03,
         -1.8188e-02, -6.1279e-02],
        ...,
        [ 7.0312e-02, -2.3315e-02,  1.8158e-03,  ...,  3.8757e-03,
         -3.6865e-02, -1.4099e-02],
        [ 2.4261e-03,  4.6631e-02, -7.5684e-02,  ..., -4.2969e-02,
          1.9531e-02, -7.9590e-02],
        [-3.3203e-02,  1.2634e-02, -5.2246e-02,  ..., -2.2736e-03,
          7.7209e-03,  4.1992e-02]], device='cuda:0', dtype=torch.bfloat16,
       requires_grad=True)

In [15]:
model.model.layers[0]


from typing import Literal
from transformers.models.phi3.modeling_phi3 import Phi3DecoderLayer
from torch import nn

NOISE_INJECTION_MODE: Literal["none", "antipodal", "random", "random_mult"] = "none"

class Phi3DecoderLayerWrapper(nn.Module):
    def __init__(self, layer: Phi3DecoderLayer, noise_mean, noise_std):
        super().__init__()
        self.layer = layer
        ref_weight = self.layer.mlp.gate_up_proj.weight
        self.noise_mean = nn.Parameter(torch.tensor(noise_mean, device=ref_weight.device, dtype=ref_weight.dtype))
        self.noise_std = nn.Parameter(torch.tensor(noise_std, device=ref_weight.device, dtype=ref_weight.dtype))

    def forward(self, hidden_states, *args, **kwargs):
        batch_dim = hidden_states.shape[0]
        
        # print(hidden_states.mean(dim=(1, 2))[0])
        if NOISE_INJECTION_MODE=="antipodal":
            # print("INJECTING NOISE AAA")
            assert batch_dim % 2 == 0
            half_dim = batch_dim // 2
            noise = torch.randn(half_dim, *hidden_states.shape[1:], device=hidden_states.device, dtype=hidden_states.dtype) * self.noise_std + self.noise_mean
            noise = torch.cat([noise, -noise], dim=0)
            new_hidden_states = hidden_states + noise
        elif NOISE_INJECTION_MODE=="random":
            # print("INJECTING NOISE BBB")
            noise = torch.randn(hidden_states.shape, device=hidden_states.device, dtype=hidden_states.dtype) * self.noise_std + self.noise_mean
            new_hidden_states = hidden_states + noise
        elif NOISE_INJECTION_MODE=="random_mult":
            noise = torch.randn(hidden_states.shape, device=hidden_states.device, dtype=hidden_states.dtype) * self.noise_std + self.noise_mean
            new_hidden_states = hidden_states * (1 + noise)
        else:
            new_hidden_states = hidden_states
        
        # print(((new_hidden_states - hidden_states).abs() / (hidden_states.abs() + 1e-10)).mean())
        return self.layer(new_hidden_states, *args, **kwargs)


LAYER = 12
for l in range(LAYER, len(model.model.layers)):
    while str(type(model.model.layers[l])) == "<class '__main__.Phi3DecoderLayerWrapper'>":
        model.model.layers[l] = model.model.layers[l].layer
    else:
        break
model.model.layers[LAYER] = Phi3DecoderLayerWrapper(model.model.layers[LAYER], noise_mean=0.0, noise_std=5e-1)

# inp_tokens = tokenizer(["hello asdf what's your name"] * 2, return_tensors="pt").to("cuda")
# inp_tokens = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
# inp_tokens = inp_tokens.repeat_interleave(2, dim=0)

# def logits_
from transformers import LogitsProcessor

def lp(input_ids, scores):
    return scores.mean(dim=0, keepdim=True)

def lp_id(input_ids, scores):
    return scores
    
# print(inp_tokens)

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 1543x + 9986 = 1313821 equation?"},
]


generation_args = {
    "max_new_tokens": 500,
    # "return_full_text": False,
    # "temperature": 0.0,
    "do_sample": False,
    "logits_processor": [lp],
    # "logits_processor": [lp_id],
}
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    batch_size=8,
)

from transformers.pipelines.text_generation import Chat

# NOISE_INJECTION_MODE = "none"
# NOISE_INJECTION_MODE = "random"
# NOISE_INJECTION_MODE = "antipodal"
NOISE_INJECTION_MODE = "antipodal"
output = pipe([Chat(x) for x in [messages,] * 8], **generation_args)

# output = model.generate(input_ids=inp_tokens, **generation_args)
for o in output:
    print("\n______\n")
    print(o[0]['generated_text'][-1]['content'])
    # print(tokenizer.decode(o))


______

 To solve the equation 1543x + 9986 = 1313821, you need to isol the variable (in this case, 'x) and solve for it. Here's how you can do it:

1. Subtract 9986 from both sides of the equation:

1543x + 9986 - 9986 = 1313821 - 9986

This simplifies to:

1543x = 1313821 - 9986

2. Now, calculate the right side of the equation:

1313821 - 9986 = 1312835

So, the equation is now:

1543x = 1312835

3. To solve for 'x', divide both sides of the equation by 1543:

1543x = 1312835

x = 1312835 / 1543

4. Calculate the value of 'x':

x = 861.21 (approximate)

So, the solution to the equation 1543x + 9986 = 1313821 is approximately 861.21.

Note: The exact value of 'x' is a fraction, and it's not a whole number. The above calculation is a simplified version of the solution.

In the context of the original question, the solution to the equation is a number (861.21) and not a specific "few (or 1543) of the following" as the question was not clear.

In the context of the original question, t

In [16]:
pipe.preprocess(Chat(messages[0]))


TypeError: can only concatenate str (not "dict") to str

In [10]:
for o in output:
    print("\n______\n")
    print(o[0]['generated_text'][-1]['content'])


______

 To solve the equation 57x + 19 = 703, follow these steps:

Step 1: Subtract 19 from both sides of the equation to isolate the term with the variable (x).
57x + 19 - 19 = 703 - 19
57x = 684

Step 2: Divide both sides of the equation by 57 to solve for x.
57x / 57 = 684 / 57
x = 12

So, the solution to the equation 57x + 19 = 703 is x = 12.

______

 To solve the equation 57x + 19 = 703, follow these steps:

Step 1: Subtract 19 from both sides of the equation to isolate the term with the variable (x).
57x + 19 - 19 = 703 - 19
57x = 684

Step 2: Divide both sides of the equation by 57 to solve for x.
57x / 57 = 684 / 57
x = 12

So, the solution to the equation 57x + 19 = 703 is x = 12.


In [95]:

# output = model.generate(input_ids=inp_tokens, **generation_args)
for o in output:
    print("\n______\n")
    print(o[0]['generated_text'][-1]['content'])
    # print(tokenizer.decode(o))



______

 To solve the linear equation 2x + 3 = 7, follow these steps:

Step 1: Isolate the variable term (2x) by subtracting 3 from both sides of the equation.
2x + 3 - 3 = 7 - 3
2x = 4

Step 2: Solve for x by dividing both sides of the equation by the coefficient of x, which is 2.
2x / 2 = 4 / 2
x = 2

So, the solution to the equation 2x + 3 = 7 is x = 2.

______

 To solve the linear equation 2x + 3 = 7, follow these steps:

Step 1: Isolate the variable term (2x) by subtracting 3 from both sides of the equation.
2x + 3 - 3 = 7 - 3
2x = 4

Step 2: Solve for x by dividing both sides of the equation by the coefficient of x, which is 2.
2x / 2 = 4 / 2
x = 2

So, the solution to the equation 2x + 3 = 7 is x = 2.


In [45]:
list(range(5))[1:]

[1, 2, 3, 4]