# Digital Twin Simulation - Simple Demo

This notebook presents simple examples of how to use personas to simulate survey responses to new questions.

## 1. Setup

In [10]:
# Import required libraries
import os
import json
import pandas as pd
from typing import Dict, List
import openai
from dotenv import load_dotenv
import time

# Pull variables from a local .env file (if present) into the environment
load_dotenv()

# Hand the key to the OpenAI module; it is None when the variable is unset
openai.api_key = os.getenv("OPENAI_API_KEY")

# Report whether a usable (non-empty) key was found
if openai.api_key:
    print("✅ OpenAI API key loaded successfully")
else:
    print("⚠️ Please set your OPENAI_API_KEY in the .env file or environment")

✅ OpenAI API key loaded successfully


## 2. Load Personas

In [11]:
# Configuration
# Passed to load_personas() below. It is an upper bound: rows whose summary is
# missing are skipped there, so fewer personas may actually be loaded.
NUM_PERSONAS = 30  # Number of personas to load (max ~2058 available)

# Download and load persona summaries directly from Hugging Face
def load_personas(num_personas=30):
    """Load persona summaries from the Twin-2K-500 dataset on Hugging Face.

    Reads the ``full_persona`` configuration (``data`` split) of
    ``LLM-Digital-Twin/Twin-2K-500`` and collects the summaries found in the
    first ``num_personas`` rows.

    Args:
        num_personas: Number of leading dataset rows to inspect. Values larger
            than the dataset are clamped to its length; zero or negative
            values yield an empty result.

    Returns:
        dict mapping ``"pid_<pid>"`` to the persona summary. Rows whose
        summary is ``None`` are skipped, so the dict may hold fewer than
        ``num_personas`` entries.

    Raises:
        subprocess.CalledProcessError: If ``datasets`` is not installed and
            the fallback ``pip install`` fails.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        import importlib
        import subprocess
        import sys

        print("Installing datasets library...")
        # Install with the interpreter running this kernel: a bare `pip` on
        # PATH may belong to a different environment, in which case the
        # retry import below would still fail.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "datasets"])
        # Make the freshly installed package visible to the import system.
        importlib.invalidate_caches()
        from datasets import load_dataset

    print(f"Loading {num_personas} persona summaries from Hugging Face...")

    # Load the dataset directly from Hugging Face
    dataset = load_dataset("LLM-Digital-Twin/Twin-2K-500", 'full_persona', split='data')

    pids = dataset["pid"]
    persona_summaries = dataset["persona_summary"]

    # Never index past the end of the dataset
    row_count = min(num_personas, len(pids))

    personas = {}
    for i in range(row_count):
        summary = persona_summaries[i]
        if summary is not None:
            personas[f"pid_{pids[i]}"] = summary

    return personas

# Fetch the configured number of personas and report how many came back
personas = load_personas(NUM_PERSONAS)
print(f"✅ Loaded {len(personas)} personas")

# Preview the beginning of the first summary as a quick sanity check
if personas:
    first_persona = next(iter(personas.values()))
    print("\nSample persona (first 500 chars):")
    print("=" * 50)
    print(first_persona[:500] + "...")

Loading 30 persona summaries from Hugging Face...
✅ Loaded 30 personas

Sample persona (first 500 chars):
The following is a description of a person.

The person's demographics are the following...
Geographic region: South (TX, OK, AR, LA, KY, TN, MS, AL, WV, DC, MD, DE, VA, NC, SC, GA, FL)
Gender: Male
Age: 18-29
Education level: Some college, no degree
Race: White
Citizen of the US: Yes
Marital status: Never been married
Religion: Protestant
Religious attendance: Once or twice a month
Political affiliation: Republican
Income: $100,000 or more
Political views: Conservative
Household size: 4
Employm...


## 3. Define Questions and Simulate Responses

In [12]:

def simulate_responses(personas, template, *, system_message=None,
                       model="gpt-4.1-mini-2025-04-14", temperature=0,
                       max_tokens=5, delay=0.5, client=None):
    """Ask the chat model to answer one question as each persona.

    Args:
        personas: Mapping of persona id to persona description text.
        template: User-prompt template containing a ``{persona}`` placeholder,
            filled with ``str.format`` (literal braces must be doubled).
        system_message: System prompt. When ``None``, the notebook-level
            ``SYSTEM_MESSAGE`` is used.
        model: Chat-completions model name.
        temperature: Sampling temperature sent with each request.
        max_tokens: Completion token limit; the default of 5 is enough for a
            single number.
        delay: Seconds to pause after each request as a gentle rate limit;
            values <= 0 disable the pause.
        client: Object exposing ``chat.completions.create``. When ``None``,
            the ``openai`` module is used.

    Returns:
        pandas.DataFrame with columns ``persona_id`` and ``answer``, one row
        per persona in iteration order. A failed request is recorded as
        ``"Error: <message>"`` in ``answer`` rather than raised.

    Raises:
        NameError: If ``system_message`` is ``None`` and ``SYSTEM_MESSAGE``
            has not been defined yet.
    """
    # Resolve defaults once, before the loop, so a missing SYSTEM_MESSAGE
    # fails immediately instead of being caught below and recorded as an
    # "Error: ..." answer for every persona.
    if system_message is None:
        system_message = SYSTEM_MESSAGE
    if client is None:
        client = openai

    rows = []
    for pid, persona in personas.items():
        user_msg = template.format(persona=persona)
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_msg},
                ],
                temperature=temperature,
                max_tokens=max_tokens,
            )
            answer = resp.choices[0].message.content.strip()
            print(f"✅ {pid}: {answer}")
        except Exception as e:
            # Deliberate best effort: one failed request must not abort the
            # whole run, so the error text is stored as that persona's answer.
            answer = f"Error: {e}"
            print(f"❌ {pid}: {answer}")
        rows.append({"persona_id": pid, "answer": answer})
        if delay > 0:
            time.sleep(delay)  # gentle rate-limit

    # Fixed columns keep the frame's shape stable even when `personas` is empty
    return pd.DataFrame(rows, columns=["persona_id", "answer"])

In [None]:
# System prompt read by simulate_responses() for every request
SYSTEM_MESSAGE = (
    "You, AI, are an expert in predicting human responses to questions. "
    "You are given a persona profile and a question, and also a format "
    "instructions that specifies the type of answer you need to provide. "
    "You need to answer the question as the persona would answer it, "
    "based on the persona profile and the format instructions."
)

# User prompt; {persona} is replaced with each persona's description
USER_PROMPT_TEMPLATE = """
{persona}

QUESTION: Linda is 31 years old, single, outspoken, and very bright. She majored in philosophy. As a student, she was deeply concerned with issues of discrimination and social justice, and also participated in anti-nuclear demonstrations. Please complete the statements below.

It is ___ that Linda is a teacher in an elementary school.

Options:
  1 = Extremely improbable
  2 = Very improbable
  3 = Somewhat probable
  4 = Moderately probable
  5 = Very probable
  6 = Extremely probable

FORMAT INSTRUCTIONS: Only return the number, no other text.
"""

# One simulated answer per loaded persona
df_1 = simulate_responses(personas=personas, template=USER_PROMPT_TEMPLATE)


✅ pid_574: 3
✅ pid_2001: 3
✅ pid_1710: 2


In [1]:
# NOTE(review): this cell repeats the Linda question from the cell above (the
# statement line here lacks the trailing period) and reassigns SYSTEM_MESSAGE,
# USER_PROMPT_TEMPLATE and df_1. It needs simulate_responses and personas to
# be defined first. Confirm whether both cells are intended.
SYSTEM_MESSAGE = (
    "You, AI, are an expert in predicting human responses to questions. "
    "You are given a persona profile and a question, and also a format "
    "instructions that specifies the type of answer you need to provide. "
    "You need to answer the question as the persona would answer it, "
    "based on the persona profile and the format instructions."
)

# User prompt; {persona} is replaced with each persona's description
USER_PROMPT_TEMPLATE = """
{persona}

QUESTION: Linda is 31 years old, single, outspoken, and very bright. She majored in philosophy. As a student, she was deeply concerned with issues of discrimination and social justice, and also participated in anti-nuclear demonstrations. Please complete the statements below.

It is ___ that Linda is a teacher in an elementary school

Options:
  1 = Extremely improbable
  2 = Very improbable
  3 = Somewhat probable
  4 = Moderately probable
  5 = Very probable
  6 = Extremely probable

FORMAT INSTRUCTIONS: Only return the number, no other text.
"""

# One simulated answer per loaded persona
df_1 = simulate_responses(personas=personas, template=USER_PROMPT_TEMPLATE)

NameError: name 'simulate_responses' is not defined