# Digital Twin Simulation - Simple Demo

This notebook presents simple examples of how to use persona summaries to simulate survey responses to new questions.

## 1. Setup

In [None]:
!pip install openai==1.78.1 pandas==2.2.2 datasets==2.18.0

import getpass
import json
import os
import sys
import time
from typing import Dict, List

import openai
import pandas as pd


### 1.1 Enter your API Key

In [2]:
# Use the OPENAI_API_KEY environment variable when it is set, so the notebook
# can run top to bottom without a prompt. Otherwise ask with getpass, which
# does not echo the key into the (saved) cell output the way input() does.
openai.api_key = (os.environ.get("OPENAI_API_KEY") or getpass.getpass("API Key: ")).strip()


## 2. Load Personas

In [None]:
# Configuration
NUM_PERSONAS = 30  # Number of personas to load (max ~2058 available)

# The setup cell normally installs `datasets`; this is a fallback for kernels
# where that cell was skipped.
try:
    from datasets import load_dataset
except ImportError:
    print("Installing datasets library...")
    import importlib
    import subprocess
    import sys

    # Install the same version the setup cell pins, so both paths give the
    # same environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "datasets==2.18.0"])
    # The import system caches directory listings; refresh them so a package
    # installed after interpreter start-up is found by the import below.
    importlib.invalidate_caches()
    from datasets import load_dataset
    print("✅ datasets library installed successfully")

# Download and load persona summaries directly from Hugging Face
def load_personas(num_personas: int = 30) -> Dict[str, str]:
    """Download persona summaries from the Hugging Face dataset.

    Loads the ``data`` split of the ``full_persona`` configuration of
    ``LLM-Digital-Twin/Twin-2K-500``. If the first attempt fails for any
    reason, a local cache directory is removed (best effort) and the dataset
    is requested again with ``download_mode='force_redownload'``.

    Args:
        num_personas: Number of leading dataset rows to consider. Rows whose
            ``persona_summary`` is ``None`` are skipped, so the result can
            contain fewer entries than requested.

    Returns:
        Dict mapping ``"pid_<pid>"`` to the corresponding persona summary.

    Raises:
        Exception: Whatever ``load_dataset`` raised on the second attempt,
            re-raised with its original traceback.
    """
    # Single source of truth for the dataset coordinates used by both attempts.
    dataset_name = "LLM-Digital-Twin/Twin-2K-500"
    config_name = "full_persona"
    split_name = "data"

    print(f"Loading {num_personas} persona summaries from Hugging Face...")

    try:
        # Load the dataset directly from Hugging Face
        dataset = load_dataset(dataset_name, config_name, split=split_name)
    except Exception as e:
        print(f"⚠️ Error loading dataset: {type(e).__name__}: {str(e)}")
        print("\nTrying to clear cache and reload...")

        import shutil
        from pathlib import Path

        # NOTE(review): this directory name is hard-coded; confirm it matches
        # the cache layout of the installed `datasets` version (and any
        # HF_DATASETS_CACHE override). The retry below forces a re-download
        # either way.
        cache_dir = Path.home() / ".cache" / "huggingface" / "datasets" / "LLM-Digital-Twin___parquet"

        if cache_dir.exists():
            print(f"Clearing cache directory: {cache_dir}")
            # Cache cleanup is best effort: a failure here (e.g. permissions)
            # must not prevent the forced re-download from being attempted.
            shutil.rmtree(cache_dir, ignore_errors=True)

        # Try loading again with download_mode='force_redownload'
        try:
            dataset = load_dataset(
                dataset_name,
                config_name,
                split=split_name,
                download_mode='force_redownload',
            )
            print("✅ Dataset loaded successfully after clearing cache")
        except Exception as e2:
            print(f"❌ Still unable to load dataset: {type(e2).__name__}: {str(e2)}")
            print("\nAlternative: You can manually download the dataset from:")
            print("https://huggingface.co/datasets/LLM-Digital-Twin/Twin-2K-500")
            # Bare raise keeps the original exception and traceback intact.
            raise

    # Extract personas
    pids = dataset["pid"]
    persona_summaries = dataset["persona_summary"]

    # Only the first `limit` rows are inspected; rows without a summary are dropped.
    limit = min(num_personas, len(pids))
    return {
        f"pid_{pids[i]}": persona_summaries[i]
        for i in range(limit)
        if persona_summaries[i] is not None
    }

# Fetch the configured number of personas and report how many came back
personas = load_personas(NUM_PERSONAS)
print(f"✅ Loaded {len(personas)} personas")

# Preview the beginning of the first persona, if any were loaded
if personas:
    first_persona = next(iter(personas.values()))
    print("\nSample persona (first 500 chars):")
    print("=" * 50)
    print(first_persona[:500] + "...")

## 3. Define Questions and Simulate Responses

In [4]:

def simulate_responses(
    personas,
    template,
    *,
    model="gpt-4.1-mini-2025-04-14",
    system_message=None,
    temperature=0,
    max_tokens=5,
    pause=0.5,
):
    """Ask the chat model to answer one question on behalf of each persona.

    For every persona the template is filled in with ``template.format(persona=...)``
    and sent, together with the system message, to
    ``openai.chat.completions.create``. A failed request does not stop the
    run: the error text is stored as that persona's answer.

    Args:
        personas: Mapping of persona id to persona text.
        template: User-prompt template containing a ``{persona}`` placeholder.
        model: Chat model name passed to the API.
        system_message: System prompt. When ``None``, the notebook-level
            ``SYSTEM_MESSAGE`` is used.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit (the default is enough for a
            single number).
        pause: Seconds to sleep after each request as a gentle rate limit;
            values <= 0 disable the sleep.

    Returns:
        pandas.DataFrame with columns ``persona_id`` and ``answer``, one row
        per persona. The columns are present even when ``personas`` is empty.

    Raises:
        NameError: If ``system_message`` is ``None`` and ``SYSTEM_MESSAGE``
            has not been defined yet.
    """
    if system_message is None:
        # Resolve the global before the loop: looked up inside the try block,
        # a missing SYSTEM_MESSAGE would be swallowed and turned into one
        # "Error: ..." row per persona instead of failing immediately.
        system_message = SYSTEM_MESSAGE

    rows = []
    for pid, persona in personas.items():
        user_msg = template.format(persona=persona)
        try:
            resp = openai.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_msg},
                ],
                temperature=temperature,
                max_tokens=max_tokens,
            )
            answer = resp.choices[0].message.content.strip()
            print(f"✅ {pid}: {answer}")
        except Exception as e:
            # Deliberate best effort: keep going and record the failure in the row.
            answer = f"Error: {e}"
            print(f"❌ {pid}: {answer}")
        rows.append({"persona_id": pid, "answer": answer})
        if pause > 0:
            time.sleep(pause)  # gentle rate-limit
    # Explicit columns keep the schema stable when there are no rows.
    return pd.DataFrame(rows, columns=["persona_id", "answer"])

In [None]:
# System prompt shared by every request: sets the model's role as a predictor
# of how a given persona would answer.
SYSTEM_MESSAGE = (
    "You, AI, are an expert in predicting human responses to questions. "
    "You are given a persona profile and a question, and also a format instructions that specifies the type of answer you need to provide. "
    "You need to answer the question as the persona would answer it, based on the persona profile and the format instructions."
)

# User prompt: persona text first, then the question, the answer scale, and
# the output format. `{persona}` is filled in per persona.
USER_PROMPT_TEMPLATE = """
{persona}

QUESTION: Linda is 31 years old, single, outspoken, and very bright. She majored in philosophy. As a student, she was deeply concerned with issues of discrimination and social justice, and also participated in anti-nuclear demonstrations. Please complete the statements below.

It is ___ that Linda is a teacher in an elementary school.

Options:
  1 = Extremely improbable
  2 = Very improbable
  3 = Somewhat probable
  4 = Moderately probable
  5 = Very probable
  6 = Extremely probable

FORMAT INSTRUCTIONS: Only return the number, no other text.
"""

# One simulated answer per loaded persona
df = simulate_responses(personas=personas, template=USER_PROMPT_TEMPLATE)
