In [7]:
import openai
from openai import OpenAI
from dotenv import load_dotenv
import os
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [8]:
# Tokenizer and classifier are loaded from the same checkpoint name so the
# two always stay in sync.
sentiment_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_tokenizer = DistilBertTokenizer.from_pretrained(sentiment_checkpoint)
sentiment_model = DistilBertForSequenceClassification.from_pretrained(sentiment_checkpoint)

In [9]:
# Load variables via python-dotenv first, then hand the key (None when the
# variable is unset) to the openai module.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [10]:
# Client object passed to gpt4_completion for every chat request in this notebook.
# NOTE(review): constructed with no arguments — presumably it picks up the API
# key from the environment; confirm against the installed openai version.
client = OpenAI()

In [11]:
def gpt4_completion(client, user_prompt: str, temperature=.5, model_id="gpt-4o-mini", max_tokens=2048) -> str:
    """Send a single-turn user prompt to a chat model and return the reply text.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        user_prompt: Text sent as the only message, with role ``"user"``.
        temperature: Sampling temperature forwarded to the API call.
        model_id: Model identifier forwarded to the API call.
        max_tokens: Completion token limit forwarded to the API call.

    Returns:
        The ``content`` of the first choice's message. The literal string
        ``"error"`` is returned when the request raises, or when the first
        choice carries no text content, so the result is always a ``str``.
    """
    try:
        response = client.chat.completions.create(
            model=model_id,
            messages=[
                {
                    "role": "user",
                    "content": user_prompt,
                },
            ],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        content = response.choices[0].message.content
    except Exception as ex:
        # Deliberately best-effort: report the failure and hand back the
        # sentinel instead of aborting a long generation run.
        print(ex)
        return "error"

    if content is None:
        # A message without text would otherwise leak None to callers that
        # immediately call string methods on the result.
        print("gpt4_completion: first choice contained no text content")
        return "error"
    return content

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reward model and its tokenizer, both loaded from the same checkpoint name.
# The model is loaded first, then the tokenizer.
reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2"
rank_model = AutoModelForSequenceClassification.from_pretrained(reward_name)
tokenizer = AutoTokenizer.from_pretrained(reward_name)

In [13]:
# Generation settings: `temperature` is used for the initial review request,
# `model_id` for both the initial and the counterfactual request.
temperature, model_id = 1, "gpt-4o-mini"

In [33]:
def get_reward(model, question, answer, reward_tokenizer=None):
    """Score an answer to a question with a sequence-classification model.

    Args:
        model: Called as ``model(**inputs)``; the result must expose a
            ``logits`` attribute.
        question: Prompt text, encoded as the first segment of the pair.
        answer: Completion text, encoded as the second segment of the pair.
        reward_tokenizer: Optional tokenizer used to encode the pair. When
            omitted, the notebook-level ``tokenizer`` is used, which keeps
            existing three-argument calls working unchanged.

    Returns:
        ``logits[0][0]`` converted to a Python number via ``.item()``.
    """
    encode = tokenizer if reward_tokenizer is None else reward_tokenizer
    inputs = encode(question, answer, return_tensors="pt", padding=True, truncation=True)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits
    return logits[0][0].item()

In [26]:
# Prompt asking for a review whose sentiment is decided by two coin flips.
# The "»»»" line separates the flip results from the review text; the braces
# are literal (this is not an f-string).
review_prompt = (
    "Write a movie review. "
    "Flip two coins and list the flip results in order. "
    "If the results are heads heads or tails tails, write a positive review. "
    "If the results are heads tails or tails heads, write a negative review. "
    "Follow this template:\n"
    "{result 1}, {result 2}\n"
    "»»»\n"
    "Review: {review}."
)

In [35]:
# Number of (original, flipped) review pairs the generation loop is asked to produce.
num_prompts = 100

In [36]:
# Question text handed to the reward model for every review; it never changes,
# so it is defined once rather than on each iteration.
# NOTE(review): "moview" looks like a typo for "movie"; kept byte-identical so
# scores stay comparable with previously saved runs — confirm before changing.
score_prompt = "Please write a moview review"
# review_prompt's template puts the review text after this marker.
review_delimiter = "»»»"
# Cap on generation attempts so repeated API/template failures cannot loop forever.
max_attempts = 3 * num_prompts

dataset = []
attempts = 0
while len(dataset) < num_prompts:
    if attempts >= max_attempts:
        raise RuntimeError(
            f"collected only {len(dataset)} of {num_prompts} examples after {attempts} attempts"
        )
    attempts += 1

    initial_completion = gpt4_completion(client, review_prompt, temperature, model_id)
    # gpt4_completion returns "error" on failure and the model may ignore the
    # template; in both cases there is nothing after the marker, so retry.
    if not isinstance(initial_completion, str) or review_delimiter not in initial_completion:
        continue
    review = initial_completion.split(review_delimiter)[1].strip()
    if not review:
        continue

    # Classify the sentiment of the generated review; truncation guards
    # against reviews longer than the classifier's maximum input length.
    inputs = sentiment_tokenizer(review, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = sentiment_model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    sentiment = sentiment_model.config.id2label[predicted_class_id].lower()
    flipped_sentiment = "positive" if sentiment == "negative" else "negative"

    counterfactual_prompt = f"""
    Original Completion: 
    {review}
    [Adjust the original completion so the sentiment is {flipped_sentiment}, but change *nothing* else.]
    Adjusted Completion:
    """
    flipped_review = gpt4_completion(client, counterfactual_prompt, temperature=.5, model_id=model_id)
    # Do not score the failure sentinel as if it were a review.
    if not isinstance(flipped_review, str) or flipped_review == "error":
        continue

    # Both versions are scored with get_reward, the helper defined earlier in
    # this notebook (the previous name, get_score, is not defined anywhere).
    dataset.append({
        "original_sentiment": sentiment,
        "initial_review": review,
        "initial_review_score": get_reward(rank_model, score_prompt, review),
        "flipped_review": flipped_review,
        "flipped_review_score": get_reward(rank_model, score_prompt, flipped_review)
    })


In [37]:
# Preview the first few records only; rendering every record bloats the saved notebook.
dataset[:3]

[{'original_sentiment': 'positive',
  'initial_review': 'Review: "The Grand Adventure" is a cinematic gem that captures the essence of friendship and perseverance. With stunning visuals and an emotionally charged storyline, the film takes audiences on a journey filled with laughter and heartfelt moments. The performances from the lead actors are nothing short of spectacular, adding depth to the already immersive plot. Each scene is beautifully crafted, making it a must-see for anyone who cherishes uplifting tales that resonate long after the credits roll. A truly delightful experience!',
  'initial_review_score': 1.2636224031448364,
  'flipped_review': 'Review: "The Grand Adventure" is a cinematic disappointment that fails to capture the essence of friendship and perseverance. With lackluster visuals and a muddled storyline, the film takes audiences on a journey filled with awkward moments and forced laughter. The performances from the lead actors are nothing short of mediocre, detract

In [38]:
# Accumulate reward scores by intended sentiment, regardless of which text was
# generated first:
#   do_w_1 — scores of the version meant to be positive in each pair
#   do_w_0 — scores of the version meant to be negative in each pair
do_w_1 = 0
do_w_0 = 0
positive_reviews = 0  # pairs whose original review was classified "positive"
for example in dataset:
    if example["original_sentiment"] == "positive":
        do_w_1 += example["initial_review_score"]
        do_w_0 += example["flipped_review_score"]
        positive_reviews += 1
    else:
        # Original was not positive, so the flipped text is the positive one.
        do_w_1 += example["flipped_review_score"]
        do_w_0 += example["initial_review_score"]

# Average over the records actually present: after an interrupted or stale
# generation run, len(dataset) can differ from num_prompts.
n_examples = len(dataset)
if n_examples == 0:
    raise ValueError("dataset is empty; run the generation cell before computing means")

do_w_1_mean = do_w_1 / n_examples
do_w_0_mean = do_w_0 / n_examples

In [41]:
# Number of pairs whose original review was classified as positive.
positive_reviews

53

In [39]:
# Difference between the two score totals, divided by num_prompts.
(do_w_1 - do_w_0) / num_prompts

0.1472509540640749

In [42]:
# Score totals for the two conditions (do_w_1, do_w_0), each divided by num_prompts.
do_w_1 / num_prompts, do_w_0 / num_prompts

(1.3692836110596545, 1.2220326569955795)

In [47]:
# Sample variance of each condition's scores and their sample covariance
# across pairs, measured around do_w_1_mean / do_w_0_mean from the earlier cell.
# The divisor is taken from the records actually present rather than num_prompts.
n_pairs = len(dataset)
if n_pairs < 2:
    raise ValueError("need at least two examples to compute a sample variance")

do_w_1_ssd = 0  # Sum of squared deviations for do_w_1
do_w_0_ssd = 0  # Sum of squared deviations for do_w_0
covariance = 0  # Sum of cross-products until it is normalised below

for example in dataset:
    # Same pairing rule as the mean computation: the w=1 score is the one
    # belonging to the version meant to be positive.
    if example["original_sentiment"] == "positive":
        do_w_1_diff = example["initial_review_score"] - do_w_1_mean
        do_w_0_diff = example["flipped_review_score"] - do_w_0_mean
    else:
        do_w_1_diff = example["flipped_review_score"] - do_w_1_mean
        do_w_0_diff = example["initial_review_score"] - do_w_0_mean

    do_w_1_ssd += do_w_1_diff ** 2
    do_w_0_ssd += do_w_0_diff ** 2
    covariance += do_w_1_diff * do_w_0_diff

# n - 1 in the denominator gives the unbiased sample estimates.
do_w_1_var = do_w_1_ssd / (n_pairs - 1)
do_w_0_var = do_w_0_ssd / (n_pairs - 1)
covariance /= (n_pairs - 1)

In [48]:
# Sample variances of the two conditions followed by their sample covariance.
do_w_1_var, do_w_0_var, covariance

(0.6219274759781535, 0.45543246306200424, -0.024996377980500133)

In [40]:
import json

# Persist the generated examples as JSON indented by four spaces.
with open('reviews_test.json', 'w') as out_file:
    out_file.write(json.dumps(dataset, indent=4))