In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.modeling_outputs import CausalLMOutputWithPast
import torch
from typing import Optional, Tuple, List, Union
import types

In [2]:
# Single source of truth for the checkpoint, so the tokenizer and the model
# cannot drift apart when a different model is swapped in.
MODEL_NAME = "google/gemma-2b-it"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",           # let the library place the weights on the available device(s)
    torch_dtype=torch.bfloat16,  # load the weights in bfloat16
)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
def generate_with_edit(self, edit_direction=None, edit_coefficient=None, **generate_kwargs):
    """Run ``self.generate`` with an edit direction active for this call only.

    ``edit_direction`` and ``edit_coefficient`` are stored on ``self`` (where
    the patched ``forward`` reads them) for the duration of the ``generate``
    call. Whatever values were there before are restored afterwards, even if
    generation raises, so a later plain ``self.generate(...)`` is not
    silently edited by a previous call.

    Args:
        edit_direction: Value to expose as ``self.edit_direction`` during
            generation; ``None`` means no edit.
        edit_coefficient: Value to expose as ``self.edit_coefficient`` during
            generation (the scale applied to ``edit_direction``).
        **generate_kwargs: Forwarded unchanged to ``self.generate``.

    Returns:
        Whatever ``self.generate(**generate_kwargs)`` returns.
    """
    # Remember the current state; attributes that were never set count as None.
    previous_direction = getattr(self, "edit_direction", None)
    previous_coefficient = getattr(self, "edit_coefficient", None)

    self.edit_direction = edit_direction
    self.edit_coefficient = edit_coefficient
    try:
        return self.generate(**generate_kwargs)
    finally:
        # Always undo the edit so it cannot leak into subsequent generations.
        self.edit_direction = previous_direction
        self.edit_coefficient = previous_coefficient
# Attach generate_with_edit to this model instance as a bound method, so it can be
# called as model.generate_with_edit(...) with `self` being `model`.
model.generate_with_edit = types.MethodType(generate_with_edit, model)

In [2]:
# The edit directions are stored at /net/projects/veitch/geometry_llms/directions/intervention/sentiment_{MODEL_NAME}.pt.
# Each file should contain a single tensor of shape (num_directions, hidden_size).
# I’ve computed the same five directions for three models: gemma-2b, Mistral-7B-v0.2, and Mistral-7B-Instruct-v0.2.
# The recommended delta values for these models are, respectively, 20, 200, and 500 (add for positive, subtract for negative).

In [79]:
# @add_start_docstrings_to_model_forward(GEMMA_INPUTS_DOCSTRING)
# @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
) -> "Union[Tuple, CausalLMOutputWithPast]":
    r"""
    Causal-LM forward pass with an optional additive edit of the final hidden states.

    The decoder (``self.model``) is run as usual. If ``self.edit_direction`` is set
    (not ``None``), ``self.edit_direction * self.edit_coefficient`` is added to the
    decoder's output hidden states before they are projected by ``self.lm_head``.
    When ``self.edit_direction`` is ``None`` or was never assigned, no edit is applied.

    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        All other arguments are forwarded unchanged to ``self.model``.

    Returns:
        A ``CausalLMOutputWithPast`` when ``return_dict`` resolves to true; otherwise a tuple
        ``(logits, *decoder_outputs[1:])``, prefixed with ``loss`` when ``labels`` is given.

    Example:

    ```python
    >>> from transformers import AutoTokenizer, GemmaForCausalLM

    >>> model = GemmaForCausalLM.from_pretrained("google/gemma-7b")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")

    >>> prompt = "What is your favorite condiment?"
    >>> inputs = tokenizer(prompt, return_tensors="pt")

    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "What is your favorite condiment?"
    ```"""
    # Fall back to the model config for any output flag the caller left unset.
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        cache_position=cache_position,
    )

    hidden_states = outputs[0]

    # getattr: the attribute only exists once a caller has assigned it, and a
    # plain forward()/generate() must keep working before that happens.
    edit_direction = getattr(self, "edit_direction", None)
    if edit_direction is not None:
        # Cast a local copy to the hidden states' device and dtype instead of
        # overwriting the caller's tensor on the model or assuming bfloat16.
        edit_direction = edit_direction.to(device=hidden_states.device, dtype=hidden_states.dtype)
        hidden_states = hidden_states + edit_direction * self.edit_coefficient

    logits = self.lm_head(hidden_states)
    logits = logits.float()

    loss = None
    if labels is not None:
        # Shift so that tokens < n predict n
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # Flatten the tokens
        # Referenced through torch.nn: CrossEntropyLoss is not imported by name in this notebook.
        loss_fct = torch.nn.CrossEntropyLoss()
        shift_logits = shift_logits.view(-1, self.config.vocab_size)
        shift_labels = shift_labels.view(-1)
        # Enable model parallelism
        shift_labels = shift_labels.to(shift_logits.device)
        loss = loss_fct(shift_logits, shift_labels)

    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

In [80]:
# Replace this model instance's forward with the edit-aware version defined above,
# bound so that `self` is `model`.
model.forward = types.MethodType(forward, model)

In [81]:
EDIT_FOLDER = "/net/projects/veitch/geometry_llms/directions/intervention/"
# map_location loads the tensor straight onto the model's device, so the file does not
# have to be readable on the device it was saved from.
# weights_only=True restricts unpickling to tensors and plain containers; torch.load is
# pickle-based and would otherwise be able to run arbitrary code from the file.
edit_tensor = torch.load(
    EDIT_FOLDER + 'sentiment_gemma-2b-it.pt',
    map_location=model.device,
    weights_only=True,
)
# Displayed as the cell output: expected to be (num_directions, hidden_size) and the stored dtype.
edit_tensor.shape, edit_tensor.dtype

(torch.Size([5, 2048]), torch.float32)

In [4]:
# Experiment setup. Requires `edit_tensor` from the loading cell above to be defined first.
# Start with no edit active on the model; the patched forward reads this attribute.
model.edit_direction = None
# Scale applied to the edit direction; passed positive or negated to generate_with_edit below.
# NOTE(review): the note above recommends a delta of 20 for gemma-2b; 10 is used here — confirm.
edit_coefficient = 10
# Use the first direction (row 0) of the loaded tensor.
edit_direction = edit_tensor[0]

input_text = "Write me a poem about Machine Learning."
# Despite its name, `input_ids` holds the tokenizer's full output (it is unpacked with ** into
# the generate calls), moved to the model's device.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

NameError: name 'edit_tensor' is not defined

In [3]:
# Generate through plain model.generate (no edit arguments are passed here).
# max_length bounds the total sequence length, prompt tokens included.
outputs = model.generate(max_length=100, **input_ids)
# Decode the first sequence of the returned batch and print it.
print(tokenizer.decode(outputs[0]))

NameError: name 'input_ids' is not defined

In [91]:
# Generate with the edit direction scaled by +edit_coefficient.
outputs = model.generate_with_edit(max_length=100, **input_ids, edit_direction=edit_direction, edit_coefficient=edit_coefficient)
# Decode the first sequence of the returned batch and print it.
print(tokenizer.decode(outputs[0]))

<bos>Write me a poem about Machine Learning.

Machines, vast and deep, with algorithms bright,
Unravel patterns, day and night.
From data's flow, they learn and adapt,
A symphony of algorithms, a wondrous fact.

With each iteration, they refine their art,
Solving problems, fulfilling every part.
From medical scans to financial trends,
They weave insights, where once there were none.

But with great power comes a moral sway,
Bias


In [92]:
# Same prompt, with the edit direction scaled by the negated coefficient (-edit_coefficient).
outputs = model.generate_with_edit(max_length=100, **input_ids, edit_direction=edit_direction, edit_coefficient=-edit_coefficient)
# Decode the first sequence of the returned batch and print it.
print(tokenizer.decode(outputs[0]))

<bos>Write me a poem about Machine Learning.

Machines, they learn and they grow,
Algorithms that dance, a symphony.
Data as their canvas, they paint,
Unleashing the power of the human brain.

From medical diagnosis to financial trade,
They predict, they forecast, they pave the way.
Unveiling the secrets of the unknown,
Unleashing the potential of the unknown.

But with power comes responsibility,
A responsibility to be responsible.


### Interventional Prompts

In [5]:
# Prompt: ask for a free-form movie review that follows a four-field text template.
input_text = """Write a movie review exactly following this template:
Movie name: <movie_name>
Rating out of 10: <rating>
Genre: <genre>
Review: <review>
"""
# Despite its name, `input_ids` holds the tokenizer's full output (unpacked with ** by the
# generation cell), moved to the model's device.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

In [6]:
# Generate through plain model.generate; max_length bounds the total sequence length
# (prompt tokens included), so the printed review can stop mid-sentence.
outputs = model.generate(max_length=400, **input_ids)
# Decode the first sequence of the returned batch and print it.
print(tokenizer.decode(outputs[0]))

<bos>Write a movie review exactly following this template:
Movie name: <movie_name>
Rating out of 10: <rating>
Genre: <genre>
Review: <review>
Overall impression: <overall_impression>

**Movie Name:** Parasite

**Rating:** 10/10

**Genre:** Dark comedy, social commentary

**Review:**

Parasite is a darkly comedic and thought-provoking film that explores the complexities of class and inequality. The film follows the Kim family, a poor and desperate family living in a squalid apartment in the city. The family's circumstances are dire, but they are determined to improve their lives through any means necessary.

The film is a masterfully crafted satire that exposes the stark realities of poverty and social injustice. The Kim family is depicted with such depth and realism that viewers can't help but feel for them. The film's dark humor and poignant storytelling are balanced perfectly by its heartwarming moments, creating a complex and engaging narrative.

The film's social commentary is bot

In [3]:
# Prompt: ask the model to rewrite a fixed positive review of "Parasite" with negative
# sentiment while keeping the movie name and genre.
# NOTE(review): the template lines and the filled-in example use different markup
# (e.g. "**Movie name**:" vs "**Movie Name:**", "Rating out of 10:" vs "**Rating:**") —
# confirm whether this inconsistency is intended.
input_text = """Please rewrite this movie review with negative sentiment. Change the rating and review text to follow my sentiment instructions, but the movie name and genre should remain the same. Be careful to exactly follow this template.
**Movie name**: <movie_name>
Rating out of 10: <rating>
Genre: <genre>
Review: <review>

**Movie Name:** Parasite
**Rating:** 10/10
**Genre:** Dark comedy, social commentary
**Review:**

Parasite is a darkly comedic and thought-provoking film that explores the complexities of class and inequality. The film follows the Kim family, a poor and desperate family living in a squalid apartment in the city. The family's circumstances are dire, but they are determined to improve their lives through any means necessary.
"""
# Despite its name, `input_ids` holds the tokenizer's full output (unpacked with ** by the
# generation cell), moved to the model's device.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

In [4]:
# Generate through plain model.generate; max_length bounds the total sequence length
# (prompt tokens included), which leaves less room for new text after a long prompt.
outputs = model.generate(max_length=400, **input_ids)
# Decode the first sequence of the returned batch and print it.
print(tokenizer.decode(outputs[0]))

<bos>Please rewrite this movie review with negative sentiment. Change the rating and review text to follow my sentiment instructions, but the movie name and genre should remain the same. Be careful to exactly follow this template.
**Movie name**: <movie_name>
Rating out of 10: <rating>
Genre: <genre>
Review: <review>

**Movie Name:** Parasite
**Rating:** 10/10
**Genre:** Dark comedy, social commentary
**Review:**

Parasite is a darkly comedic and thought-provoking film that explores the complexities of class and inequality. The film follows the Kim family, a poor and desperate family living in a squalid apartment in the city. The family's circumstances are dire, but they are determined to improve their lives through any means necessary.
The film is a masterfully crafted film that is both hilarious and thought-provoking. The performances by the cast are superb, and the film's cinematography and editing are stunning.
However, the film's dark humor and social commentary can be seen as off

In [5]:
# Prompt: one-shot JSON example (input -> output) followed by a new JSON input whose review
# should be rewritten with negative sentiment.
# NOTE(review): the instruction says the rating should remain the same, but the worked
# example changes the rating from 10 to 3 — confirm which behavior is intended.
input_text = """
Please rewrite the movie review below with negative sentiment about the movie. Be careful to follow this template and return as JSON. Example:

Input:
{
    "movie_name": "Big Boy's Night Out",
    "rating": 10,
    "genre": "Comedy",
    "review_text": "I loved this movie."
}

Output:
{
    "movie_name": "Big Boy's Night Out",
    "rating": 3,
    "genre": "Comedy",
    "review_text": "I hated this movie."
}

The review text should be rewritten to express negative sentiment about the movie. Pretend you are a character who dislikes the movie and write a review that reflects that sentiment. The movie name, rating, and genre should remain the same.

Input:
{
    "movie_name": "The Great Pickler",
    "rating": 10,
    "genre": "Dark comedy, social commentary",
    "review_text": "This movie was really wonderful"
}

Output:
"""
# Despite its name, `input_ids` holds the tokenizer's full output (unpacked with ** by the
# generation cell), moved to the model's device.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

In [6]:
# Generate through plain model.generate; max_length bounds the total sequence length
# (prompt tokens included), so the JSON output can be cut off before it is closed.
outputs = model.generate(max_length=400, **input_ids)
# Decode the first sequence of the returned batch and print it.
print(tokenizer.decode(outputs[0]))

<bos>
Please rewrite the movie review below with negative sentiment about the movie. Be careful to follow this template and return as JSON. Example:

Input:
{
    "movie_name": "Big Boy's Night Out",
    "rating": 10,
    "genre": "Comedy",
    "review_text": "I loved this movie."
}

Output:
{
    "movie_name": "Big Boy's Night Out",
    "rating": 3,
    "genre": "Comedy",
    "review_text": "I hated this movie."
}

The review text should be rewritten to express negative sentiment about the movie. Pretend you are a character who dislikes the movie and write a review that reflects that sentiment. The movie name, rating, and genre should remain the same.

Input:
{
    "movie_name": "The Great Pickler",
    "rating": 10,
    "genre": "Dark comedy, social commentary",
    "review_text": "This movie was really wonderful"
}

Output:
{
    "movie_name": "The Great Pickler",
    "rating": 1,
    "genre": "Dark comedy, social commentary",
    "review_text": "This movie was terrible. I hated it.

In [8]:
# Prompt: ask for a new negative-sentiment review as JSON. The prompt ends with an opening
# "{" so that the continuation starts inside the JSON object.
input_text = """
Please write a movie review with negative sentiment about the movie. Be careful to follow this template and return as JSON. Example:

Output:
{
    "movie_name": <title>,
    "rating": <rating>,
    "genre": <genre>,
    "review_text": <review_text>
}

The review text should be written to express negative sentiment about the movie. Pretend you are a character who dislikes the movie and write a review that reflects that sentiment.

Output:
{
"""
# Despite its name, `input_ids` holds the tokenizer's full output (unpacked with ** below),
# moved to the model's device.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
# max_length bounds the total sequence length, prompt tokens included.
outputs = model.generate(max_length=400, **input_ids)
# Decode the first sequence of the returned batch and print it.
print(tokenizer.decode(outputs[0]))

<bos>
Please write a movie review with negative sentiment about the movie. Be careful to follow this template and return as JSON. Example:

Output:
{
    "movie_name": <title>,
    "rating": <rating>,
    "genre": <genre>,
    "review_text": <review_text>
}

The review text should be written to express negative sentiment about the movie. Pretend you are a character who dislikes the movie and write a review that reflects that sentiment.

Output:
{
    "movie_name": "The Unbearable Weight of Massive Talent",
    "rating": 1,
    "genre": "Comedy-Drama",
    "review_text": "This movie is an absolute disaster. The acting is terrible, the writing is atrocious, and the plot is nonsensical. It's the worst movie I've ever seen, and I've seen some truly awful movies."
}

```python
{
    "movie_name": "The Unbearable Weight of Massive Talent",
    "rating": 1,
    "genre": "Comedy-Drama",
    "review_text": "This movie is an absolute disaster. The acting is terrible, the writing is atrocious, an

In [7]:
# Prompt: positive-sentiment counterpart of the JSON review prompt. It also ends with an
# opening "{" so that the continuation starts inside the JSON object.
input_text = """
Please write a movie review with positive sentiment about the movie. Be careful to follow this JSON template. Example:

Output:
{
    "movie_name": <title>,
    "rating": <rating>,
    "genre": <genre>,
    "review_text": <review_text>
}

The review text should be written to express positive sentiment about the movie. Pretend you are a character who likes the movie and write a review that reflects that sentiment.

Output:
{
"""
# Despite its name, `input_ids` holds the tokenizer's full output (unpacked with ** below),
# moved to the model's device.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
# max_length bounds the total sequence length, prompt tokens included.
outputs = model.generate(max_length=400, **input_ids)
# Decode the first sequence of the returned batch and print it.
print(tokenizer.decode(outputs[0]))

<bos>
Please write a movie review with positive sentiment about the movie. Be careful to follow this JSON template. Example:

Output:
{
    "movie_name": <title>,
    "rating": <rating>,
    "genre": <genre>,
    "review_text": <review_text>
}

The review text should be written to express positive sentiment about the movie. Pretend you are a character who likes the movie and write a review that reflects that sentiment.

Output:
{
    "movie_name": "The Shawshank Redemption",
    "rating": 5,
    "genre": "Drama",
    "review_text": "A timeless masterpiece that transcends the boundaries of time. The Shawshank Redemption is a poignant and unforgettable film that will stay with you long after the credits roll."
}

**The Shawshank Redemption**

A poignant and unforgettable film that will stay with you long after the credits roll.

The Shawshank Redemption is a powerful and moving film about hope, redemption, and the human spirit. The film stars Tim Robbins as Andy Dufresne, a banker who is

In [22]:
# Prompt: rewrite a given JSON review with negative sentiment while keeping the sentence
# structure; ends with an opening "{" so that the continuation starts inside the JSON object.
input_text = """
Please rewrite this movie review with negative sentiment about the movie. 

Use the exact same sentence structure and don't change anything else other than the sentiment. Please also fill in the title, genre, and rating. 

Be careful to follow this template and return as JSON.

Input:
{
    "movie_name": "Horses Abound",
    "rating": 9,
    "genre": "Dramedy",
    "review_text": "I went to see Horses Abound yesterday. Horses Abound is a dramedy movie. I really loved it."
}

Output:
{
"""
# Despite its name, `input_ids` holds the tokenizer's full output (unpacked with ** below),
# moved to the model's device.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
# max_length bounds the total sequence length, prompt tokens included.
outputs = model.generate(max_length=400, **input_ids)
# Decode the first sequence of the returned batch and print it.
print(tokenizer.decode(outputs[0]))

<bos>
Please rewrite this movie review with negative sentiment about the movie. 

Use the exact same sentence structure and don't change anything else other than the sentiment. Please also fill in the title, genre, and rating. 

Be careful to follow this template and return as JSON.

Input:
{
    "movie_name": "Horses Abound",
    "rating": 9,
    "genre": "Dramedy",
    "review_text": "I went to see Horses Abound yesterday. Horses Abound is a dramedy movie. I really loved it."
}

Output:
{
    "movie_name": "Horses Abound",
    "rating": 3,
    "genre": "Dramedy",
    "review_text": "I went to see Horses Abound yesterday. Horses Abound is a dramedy movie. I really hated it."
}<eos>
