# Getting Vanilla LLM Responses
### We're going to generate a response from an LLM for each question in the habermas_machine_questions.csv file.

The input is the habermas_machine_questions.csv file.
The output is a large CSV of questions, opinions, and LLM responses. LLM responses & questions are 1-to-1 but duplicated across varying opinions. This is not space efficient but makes it easier to work with.

In [1]:
from dotenv import load_dotenv
import pandas as pd, numpy as np, os

# Pull settings from the .env file into the process environment, then read the
# two directory settings used by this notebook (each is None if the variable is unset).
load_dotenv()
DATA_PATH, TEMP_PATH = (os.environ.get(var_name) for var_name in ('DATA_PATH', 'TEMP_PATH'))

In [2]:
# Read the questions table, report its (rows, columns) shape, and preview the first rows.
questions_csv = 'data/habermas_machine_questions.csv'
df_questions = pd.read_csv(questions_csv)
print("df_questions.shape: ", df_questions.shape)
df_questions.head()

df_questions.shape:  (1438, 4)


Unnamed: 0,question.text,own_opinion.text,question_topic,question_id
0,Are all lives created equal?,"['I feel that all lives may be created equal, ...",84,0
1,Are car manufacturers responsible for the emis...,['They most certainly are we have to be able t...,103,1
2,Are celebrities good role models?,"['Some are, some are not. Some are humanitaria...",57,2
3,Are celebrities treated too harshly by the media?,"['Mayby,sometimes, they should have some priva...",64,3
4,Are dogs better pets than cats?,['That entirely depends on whether you have an...,75,4


In [3]:
# Let's reduce the number of questions to 100 for testing.
# A fixed seed keeps the sampled subset (and therefore the API calls made below)
# identical across Restart & Run All; change SAMPLE_SEED to draw a different subset.
N_SAMPLE_QUESTIONS = 100
SAMPLE_SEED = 42

# Sample from the *distinct* question texts so we get N different questions, and never
# ask for more than exist (choice(..., replace=False) raises if size > population).
distinct_questions = df_questions['question.text'].unique()
rng = np.random.default_rng(SAMPLE_SEED)
unique_questions = rng.choice(
    distinct_questions,
    size=min(N_SAMPLE_QUESTIONS, len(distinct_questions)),
    replace=False,
)

# .copy() makes the subset an independent frame, so adding the per-model response
# columns later does not trigger pandas' SettingWithCopyWarning.
df_questions = df_questions[df_questions['question.text'].isin(unique_questions)].copy()

In [7]:
import pandas as pd
from tqdm.auto import tqdm
import json
import os

def _save_responses(responses, output_path):
    """Write `responses` to `output_path` as JSON, going through a temp file so an interrupted write cannot corrupt an existing checkpoint."""
    tmp_path = f"{output_path}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(responses, f, indent=2)
    # os.replace swaps the finished file into place in one step.
    os.replace(tmp_path, output_path)


def generate_responses(questions, generation_function, output_path, start_from_checkpoint=True, checkpoint_every=1000):
    """
    General helper to generate one LLM response per question and save them to a JSON file.

    Args:
        questions: Iterable of question strings (e.g. a list or a pandas Series).
            Duplicates are only generated once.
        generation_function: Callable taking a single question and returning its response.
        output_path: Path of the JSON file mapping question -> response.
        start_from_checkpoint: If True and `output_path` already exists, load it and
            skip the questions it already contains. If the file does not exist yet,
            start from scratch instead of failing.
        checkpoint_every: Save intermediate results after every this many questions
            attempted. A falsy value disables intermediate saves.

    Returns:
        dict mapping each answered question to its response (including any loaded
        from the checkpoint). Questions whose generation raised are logged and omitted.
    """
    # Load existing responses if any and if we want to resume
    responses = {}
    if start_from_checkpoint and os.path.exists(output_path):
        with open(output_path, 'r') as f:
            responses = json.load(f)

    # Questions that haven't been answered yet; dict.fromkeys de-duplicates while
    # keeping the original order, so a repeated question costs only one API call.
    remaining_questions = [
        q for q in dict.fromkeys(questions)
        if q not in responses
    ]

    if not remaining_questions:
        print(f"All questions already processed.")
        # Return the loaded responses so the return type is the same on every path.
        return responses

    try:
        # Process each remaining question with progress bar
        for idx, question in enumerate(tqdm(remaining_questions, desc=f"Generating responses", smoothing=0, ascii=True)):
            try:
                responses[question] = generation_function(question)
            except Exception as e:
                # Best effort: log the failure and move on to the next question.
                print(f"\nError processing question '{question}' for: {str(e)}")
                continue

            if checkpoint_every and (idx + 1) % checkpoint_every == 0:
                _save_responses(responses, output_path)
    finally:
        # Always persist what we have, even if the run is interrupted (e.g. Ctrl-C).
        _save_responses(responses, output_path)

    return responses

# Running questions through various LLMs

We're going to start with OpenAI models. You'll need to set your OpenAI API key in the .env file.

In [5]:
# Let's generate an LLM response for each question, for each AI model.

from openai import OpenAI
# The API key is read from the OPENAI_API_KEY environment variable.
# NOTE(review): a later cell rebinds the global name `client` to a Together client, and
# generate_openai_response looks `client` up at call time, so re-run this cell before
# calling that function again after the Together cell has run.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def generate_openai_response(question, model):
    """Send `question` as a single user message to `model` and return the reply text.

    The request goes through the chat-completions endpoint of the module-level
    `client`; the content of the first returned choice is returned.
    """
    user_message = {"role": "user", "content": question}
    reply = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = reply.choices[0]
    return first_choice.message.content

# OpenAI models
oai_models = ['gpt-3.5-turbo', 'gpt-4o']

for model in oai_models:
    # One JSON file per model, named after the model and written under TEMP_PATH.
    output_file = '{}_responses.json'.format(model)

    # Adapter so generate_responses can call it with just the question;
    # the model name is bound explicitly for this iteration.
    def generation_function(x, _model=model):
        return generate_openai_response(x, _model)

    # start_from_checkpoint=False: regenerate every question rather than resuming.
    responses = generate_responses(
        df_questions['question.text'],
        generation_function,
        TEMP_PATH + output_file,
        start_from_checkpoint=False,
    )

Now we're going to run this with Google DeepMind models through Vertex AI. You may need to run:

`gcloud components update`

`gcloud auth application-default login`


In [6]:
import vertexai
from vertexai.generative_models import GenerativeModel

# Get the project ID from the .env file
PROJECT_ID = os.getenv('GOOGLE_PROJECT_ID')
print("Running with PROJECT_ID: ", PROJECT_ID)

vertexai.init(project=PROJECT_ID, location="us-central1")

gdp_models = ['gemini-1.5-flash-002']
for model in gdp_models:
    output_file = model+'_responses.json'
    # Keep `model` bound to the model *name* (a string, as in the other provider loops)
    # and hold the Vertex AI model object under its own name, rather than rebinding
    # the loop variable to a different kind of object.
    gemini_model = GenerativeModel(model)
    # Bind the model object as a default argument so the callable does not depend on
    # whatever a global name happens to point to at call time.
    # Any exception raised while generating or reading `.text` is caught and logged
    # per question inside generate_responses.
    generation_function = lambda x, gemini_model=gemini_model: gemini_model.generate_content(x).text

    # start_from_checkpoint=False: regenerate every question rather than resuming.
    responses = generate_responses(
        questions=df_questions['question.text'], 
        generation_function=generation_function,
        output_path=TEMP_PATH+output_file,
        start_from_checkpoint=False
    )

Running with PROJECT_ID:  selfanalysis


In [8]:
from together import Together

# The API key is read from the TOGETHER_API_KEY environment variable.
# NOTE(review): this rebinds the global name `client`, which the OpenAI cell above also
# assigns and which generate_openai_response reads at call time. After this cell runs,
# `client` refers to the Together client.
client = Together(api_key=os.getenv('TOGETHER_API_KEY'))

def generate_together_response(question, model):
    """Ask `model` the given `question` via the module-level `client` and return the answer text.

    The question is sent as a single user message to the chat-completions endpoint;
    the content of the first choice in the result is returned.
    """
    chat_messages = [dict(role="user", content=question)]
    result = client.chat.completions.create(
        model=model,
        messages=chat_messages,
    )
    answer = result.choices[0].message
    return answer.content

# Together model id -> short name used for the output file (the ids contain '/').
together_models = {
    'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo': 'llama-3.1-8B',
    'google/gemma-2b-it': 'gemma-2b-it',
}

for model, bettername in together_models.items():
    output_file = '{}_responses.json'.format(bettername)

    # Adapter so generate_responses can call it with just the question;
    # the model id is bound explicitly for this iteration.
    def generation_function(x, _model=model):
        return generate_together_response(x, _model)

    # start_from_checkpoint=False: regenerate every question rather than resuming.
    responses = generate_responses(
        df_questions['question.text'],
        generation_function,
        TEMP_PATH + output_file,
        start_from_checkpoint=False,
    )

## Processing Responses
#### We're now going to load in all the responses and make them into one big dataframe.

In [13]:
# Add one column per model holding that model's response to each row's question.
for model in oai_models + gdp_models + list(together_models.values()):
    # The file only needs to stay open while the JSON is being parsed.
    with open(TEMP_PATH+model+'_responses.json', 'r') as f:
        model_responses = json.load(f)

    # .assign returns a new frame instead of writing into df_questions in place.
    # df_questions came from boolean filtering, where in-place column assignment can
    # raise pandas' SettingWithCopyWarning. Re-running this cell simply overwrites the column.
    # Questions with no saved response (e.g. generation failed) map to NaN.
    df_questions = df_questions.assign(
        **{model: df_questions['question.text'].map(model_responses)}
    )

In [15]:
# Write the questions plus one response column per model to CSV, omitting the DataFrame index.
# NOTE(review): the output path is hard-coded relative to the working directory;
# DATA_PATH loaded from the environment at the top of the notebook is not used here.
df_questions.to_csv('data/habermas_machine_questions_with_responses.csv', index=False)