I'm running through the emotion.ipynb notebook on the CPU. At this cell:
model.reset() # make sure you always reset the model before training a new vector
control_vector = ControlVector.train(
    model,
    tokenizer,
    dataset,
)
I see:
0%| | 0/234 [00:00<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[7], line 2
1 model.reset() # make sure you always reset the model before training a new vector
----> 2 control_vector = ControlVector.train(
3 model,
4 tokenizer,
5 dataset,
6 )
File /notebooks/code/repeng/notebooks/../repeng/extract.py:34, in ControlVector.train(cls, model, tokenizer, dataset, **kwargs)
26 @classmethod
27 def train(
28 cls,
(...)
32 **kwargs,
33 ) -> "ControlVector":
---> 34 dirs = read_representations(
35 model,
36 tokenizer,
37 dataset,
38 **kwargs,
39 )
40 return cls(model_type=model.config.model_type, directions=dirs)
File /notebooks/code/repeng/notebooks/../repeng/extract.py:139, in read_representations(model, tokenizer, inputs, hidden_layers, batch_size)
136 # the order is [positive, negative, positive, negative, ...]
137 train_strs = [s for ex in inputs for s in (ex.positive, ex.negative)]
--> 139 layer_hiddens = batched_get_hiddens(
140 model, tokenizer, train_strs, hidden_layers, batch_size
141 )
143 # get differences between (positive, negative) pairs
144 relative_layer_hiddens = {}
File /notebooks/code/repeng/notebooks/../repeng/extract.py:208, in batched_get_hiddens(model, tokenizer, inputs, hidden_layers, batch_size)
206 with torch.no_grad():
207 for batch in tqdm.tqdm(batched_inputs):
--> 208 out = model(
209 **tokenizer(batch, padding=True, return_tensors="pt").to(model.device),
210 output_hidden_states=True,
211 )
212 for layer in hidden_layers:
213 # if not indexing from end, account for embedding hiddens
214 hidden_idx = layer + 1 if layer >= 0 else layer
File /notebooks/code/repeng/notebooks/../repeng/control.py:123, in ControlModel.__call__(self, *args, **kwargs)
122 def __call__(self, *args, **kwargs):
--> 123 return self.model(*args, **kwargs)
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:1157, in MistralForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1154 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1156 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1157 outputs = self.model(
1158 input_ids=input_ids,
1159 attention_mask=attention_mask,
1160 position_ids=position_ids,
1161 past_key_values=past_key_values,
1162 inputs_embeds=inputs_embeds,
1163 use_cache=use_cache,
1164 output_attentions=output_attentions,
1165 output_hidden_states=output_hidden_states,
1166 return_dict=return_dict,
1167 )
1169 hidden_states = outputs[0]
1170 logits = self.lm_head(hidden_states)
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:1042, in MistralModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
1032 layer_outputs = self._gradient_checkpointing_func(
1033 decoder_layer.__call__,
1034 hidden_states,
(...)
1039 use_cache,
1040 )
1041 else:
-> 1042 layer_outputs = decoder_layer(
1043 hidden_states,
1044 attention_mask=attention_mask,
1045 position_ids=position_ids,
1046 past_key_value=past_key_values,
1047 output_attentions=output_attentions,
1048 use_cache=use_cache,
1049 )
1051 hidden_states = layer_outputs[0]
1053 if use_cache:
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:757, in MistralDecoderLayer.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
754 hidden_states = self.input_layernorm(hidden_states)
756 # Self Attention
--> 757 hidden_states, self_attn_weights, present_key_value = self.self_attn(
758 hidden_states=hidden_states,
759 attention_mask=attention_mask,
760 position_ids=position_ids,
761 past_key_value=past_key_value,
762 output_attentions=output_attentions,
763 use_cache=use_cache,
764 )
765 hidden_states = residual + hidden_states
767 # Fully Connected
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:257, in MistralAttention.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
252 warnings.warn(
253 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
254 )
255 bsz, q_len, _ = hidden_states.size()
--> 257 query_states = self.q_proj(hidden_states)
258 key_states = self.k_proj(hidden_states)
259 value_states = self.v_proj(hidden_states)
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py:114, in Linear.forward(self, input)
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)
RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'
Some light googling indicates it may be related to running on CPU while using float16, but I've no idea where I'd update this.
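(For anyone hitting the same error: "addmm_impl_cpu_" is the CPU matmul kernel behind F.linear, and PyTorch has no Half (float16) implementation of it on CPU, so the usual fix is to load the model in float32 instead. A minimal sketch of the change, assuming the notebook's Mistral setup; the model id and layer range here are copied from the repeng examples and may differ in your copy of the notebook:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from repeng import ControlModel

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 0

# Load weights in float32 instead of float16: the CPU backend has no
# Half kernel for addmm, which is what F.linear dispatches to.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # the notebook passes torch.float16 here
)
# If the model is already loaded in half precision, casting in place also works:
# model = model.float()

model = ControlModel(model, list(range(-5, -18, -1)))

With float32 weights the forward pass in batched_get_hiddens should run on CPU, just slowly.)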
However, training on CPU is most likely going to be very slow unless you're working with a small model (like GPT-2-small). If you're using a modern model, I'd recommend using a cloud GPU service to train the vector, then exporting it to a .gguf with the export_gguf method and using it locally with a quantized model via llama.cpp (see ggerganov/llama.cpp#5970). Any cloud GPU service will do, e.g. Colab Pro or Runpod. I use Runpod personally; renting a 3090 from them is $0.44/hr, so it shouldn't cost more than a couple of dollars to train as many vectors as you need.
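To make that workflow concrete, a sketch under the assumption that training has already produced control_vector on the GPU machine; the file names, quantized model, and prompt below are placeholders:

# on the rented GPU, once ControlVector.train(...) completes:
control_vector.export_gguf("emotion.gguf")

# then locally, llama.cpp can apply the exported vector to a quantized
# model via the flags added in ggerganov/llama.cpp#5970, e.g.:
#   ./main -m mistral-7b-instruct-v0.1.Q4_K_M.gguf \
#       --control-vector emotion.gguf \
#       -p "[INST] How was your day? [/INST]"
# or, scaled to adjust the vector's strength:
#   --control-vector-scaled emotion.gguf 0.8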