In [9]:
import os
from pathlib import Path

import pandas as pd
from pandas.api.types import is_scalar

# Project data directory.
# Defaults to the original author's local folder, but can be redirected without
# editing the notebook by setting the PERSONALITY_DATA_DIR environment variable.
BASE_DIR = Path(
    os.environ.get(
        "PERSONALITY_DATA_DIR",
        "/Users/nursultanatymtay/Desktop/Senior Project/Personality trait inference from text",
    )
)

# Pickled annotation and transcription files for the training and test splits.
annotation_path = BASE_DIR / "annotation_training.pkl"
transcription_path = BASE_DIR / "transcription_training.pkl"
annotation_test_path = BASE_DIR / "annotation_test.pkl"
transcription_test_path = BASE_DIR / "transcription_test.pkl"


def load_pickle_as_dataframe(path: Path) -> pd.DataFrame:
    """Load a pickle file and coerce its contents into a pandas DataFrame.

    Conversion rules, by type of the unpickled object:
    - DataFrame -> returned as-is
    - empty dict / list / tuple -> empty DataFrame
    - dict of scalars -> single-row DataFrame (one column per key)
    - any other dict -> ``pd.DataFrame(obj)``; if that constructor rejects the
      values, one row per key via ``from_dict(orient="index")``
    - list/tuple whose first element is a dict -> one row per element
    - list/tuple of scalars -> single column named ``"value"``
    - any other list/tuple -> ``pd.DataFrame(list(obj))``
    - bare scalar -> 1x1 DataFrame with column ``"value"``

    Args:
        path: Location of the pickle file.

    Returns:
        The unpickled data as a DataFrame.

    Raises:
        TypeError: If the object matches none of the rules above and the
            DataFrame constructor cannot handle it either.

    Note:
        Unpickling can execute arbitrary code; only load files from a trusted source.
    """
    obj = pd.read_pickle(path)

    if isinstance(obj, pd.DataFrame):
        return obj

    if isinstance(obj, dict):
        # An empty dict would pass the all-scalars check vacuously and yield a
        # 1-row, 0-column frame; treat it like the empty list case instead.
        if not obj:
            return pd.DataFrame()
        # dict of scalars -> one row
        if all(is_scalar(v) for v in obj.values()):
            return pd.DataFrame([obj])
        # otherwise try building from the dict directly (column-oriented)
        try:
            return pd.DataFrame(obj)
        except Exception:
            # e.g. values of unequal length: fall back to one row per key
            return pd.DataFrame.from_dict(obj, orient="index").reset_index()

    if isinstance(obj, (list, tuple)):
        if len(obj) == 0:
            return pd.DataFrame()
        # Only the first element is inspected to detect a list of records.
        if isinstance(obj[0], dict):
            return pd.DataFrame(list(obj))
        if all(is_scalar(x) for x in obj):
            return pd.DataFrame({"value": list(obj)})
        # fallback: let pandas interpret nested sequences
        return pd.DataFrame(list(obj))

    if is_scalar(obj):
        return pd.DataFrame({"value": [obj]})

    # final fallback for any other container type
    try:
        return pd.DataFrame(obj)
    except Exception as exc:
        raise TypeError(f"Pickle at {path} could not be converted to a DataFrame. Type: {type(obj)}") from exc


# Read all four pickles up front so a missing file fails before anything is reported.
annotation_df = load_pickle_as_dataframe(annotation_path)
transcription_df = load_pickle_as_dataframe(transcription_path)
annotation_test_df = load_pickle_as_dataframe(annotation_test_path)
transcription_test_df = load_pickle_as_dataframe(transcription_test_path)

# Report the shape and type of each loaded frame as a quick sanity check.
for frame_label, frame in (
    ("annotation_df", annotation_df),
    ("transcription_df", transcription_df),
    ("annotation_test_df", annotation_test_df),
    ("transcription_test_df", transcription_test_df),
):
    print(f"{frame_label}:", frame.shape, type(frame))


annotation_df: (6000, 6) <class 'pandas.core.frame.DataFrame'>
transcription_df: (1, 6000) <class 'pandas.core.frame.DataFrame'>
annotation_test_df: (2000, 6) <class 'pandas.core.frame.DataFrame'>
transcription_test_df: (1, 2000) <class 'pandas.core.frame.DataFrame'>


In [10]:
import pandas as pd

# Display settings so full transcripts are readable in the rendered tables.
pd.set_option('display.max_colwidth', None)   # don't truncate long strings
pd.set_option('display.max_columns', None)    # show all columns
pd.set_option('display.width', 0)             # let pandas use full cell width


def _transcriptions_to_long(wide_df):
    """Turn the 1-row, one-column-per-file transcription frame into (file, text) rows."""
    return (
        wide_df.T
        .rename(columns={0: "text"})
        .reset_index()
        .rename(columns={'index': 'file'})
    )


# Training transcriptions: one row per clip.
transcription_df_T = _transcriptions_to_long(transcription_df)

display(transcription_df_T.head(3))

# Test transcriptions, reshaped the same way as the training split.
transcription_test_df_T = _transcriptions_to_long(transcription_test_df)
display(transcription_test_df_T.head(3))


Unnamed: 0,file,text
0,J4GQm9j0JZ0.003.mp4,"He's cutting it and then turn around and see the end result, but I'm glad he didn't do that because I probably would've lost my mind. As it was getting cut, I was just excited. I saw the snippets of hair falling to the floor and I was like, ""Yes!"""
1,zEyRyTnIw5I.005.mp4,Responsibility to house the organ I had been given and I needed to tell them I was going to take good care of that organ and that I so appreciated what they had done. Almost immediately I sent a letter to them
2,nskJh7v6v1U.004.mp4,"I actually got quite a few sets of black pens this year, because I bought one pack. I think I bought two packs, actually, that I really liked, and then I found ... Some people at my work had these really cool pens that I liked a lot, and I liked how they wrote-"


Unnamed: 0,file,text
0,htH89DBizno.004.mp4,"... Going nuts from another room, run in there to check, there's no [inaudible 00:00:37], but it was like the [scissors 00:00:35] aren't there. Now maybe I'm just not sleeping enough that I moved the scissors somewhere, but I swear the-"
1,p_wf-KszNlk.001.mp4,"I've got a little bit to go but we need you there so...I have decided to do a Q&A video. Obviously I'm going to need you guy's help with that, or girls. Goddamn, now I'm going to get killed by"
2,MuYYY3XaJ7Q.001.mp4,"A video's quality over quantity, so everyone can have a chance to watch it and that's why I've been doing 2 to 3 days before each video. I've had a couple of people asking me about that. That's just like a quick little thing, trying to spice up this video."


In [11]:

# Move the clip filename out of the index into a regular 'file' column so the
# annotations can be joined to the transcriptions on that key.
# The guards keep this cell idempotent: without them, re-running it would
# reset the fresh RangeIndex and add a second, numeric 'file' column.
if 'file' not in annotation_df.columns:
    annotation_df = annotation_df.reset_index().rename(columns={'index':'file'})
if 'file' not in annotation_test_df.columns:
    annotation_test_df = annotation_test_df.reset_index().rename(columns={'index':'file'})

display(annotation_df.head(10))
display(annotation_test_df.head(10))



Unnamed: 0,file,extraversion,neuroticism,agreeableness,conscientiousness,interview,openness
0,J4GQm9j0JZ0.003.mp4,0.523364,0.552083,0.626374,0.601942,0.504673,0.488889
1,zEyRyTnIw5I.005.mp4,0.345794,0.375,0.472527,0.582524,0.457944,0.366667
2,nskJh7v6v1U.004.mp4,0.252336,0.291667,0.406593,0.485437,0.373832,0.511111
3,6wHQsN5g2RM.000.mp4,0.457944,0.489583,0.505495,0.398058,0.457944,0.377778
4,dQOeQYWIgm8.000.mp4,0.607477,0.489583,0.406593,0.621359,0.570093,0.622222
5,eHcRre1YsNA.000.mp4,0.196262,0.302083,0.351648,0.262136,0.214953,0.566667
6,vZpneJlniAE.005.mp4,0.420561,0.635417,0.571429,0.466019,0.53271,0.633333
7,oANKg9_grdA.004.mp4,0.429907,0.583333,0.626374,0.582524,0.551402,0.588889
8,VuadgOz6T7s.000.mp4,0.224299,0.135417,0.153846,0.145631,0.140187,0.233333
9,7nhJXn9PI0I.001.mp4,0.17757,0.197917,0.186813,0.417476,0.224299,0.355556


Unnamed: 0,file,extraversion,neuroticism,agreeableness,conscientiousness,interview,openness
0,htH89DBizno.004.mp4,0.485981,0.645833,0.681319,0.669903,0.626168,0.822222
1,p_wf-KszNlk.001.mp4,0.616822,0.59375,0.692308,0.514563,0.570093,0.655556
2,MuYYY3XaJ7Q.001.mp4,0.46729,0.625,0.56044,0.524272,0.514019,0.522222
3,0MB91ku0eEw.005.mp4,0.411215,0.458333,0.714286,0.660194,0.570093,0.4
4,WpEZOSrENL0.003.mp4,0.317757,0.4375,0.384615,0.524272,0.448598,0.411111
5,C2Y9Puk3Obk.004.mp4,0.831776,0.84375,0.923077,0.708738,0.850467,0.822222
6,ask-ZFRztf8.003.mp4,0.46729,0.53125,0.43956,0.592233,0.504673,0.555556
7,TSGpD2NBeCQ.005.mp4,0.46729,0.604167,0.67033,0.553398,0.579439,0.677778
8,54JawR1x0II.004.mp4,0.429907,0.489583,0.615385,0.601942,0.570093,0.588889
9,9n8dNi-ERQ0.001.mp4,0.542056,0.572917,0.472527,0.466019,0.542056,0.466667


In [12]:
# Join each split's transcriptions with its trait annotations on the shared
# 'file' key, then drop the 'interview' score (ignored if the column is absent).
annotation_transcription_train = (
    transcription_df_T
    .merge(annotation_df, on='file')
    .drop(columns=['interview'], errors='ignore')
)

display(annotation_transcription_train.head(3))

annotation_transcription_test = (
    transcription_test_df_T
    .merge(annotation_test_df, on='file')
    .drop(columns=['interview'], errors='ignore')
)

display(annotation_transcription_test.head(3))


Unnamed: 0,file,text,extraversion,neuroticism,agreeableness,conscientiousness,openness
0,J4GQm9j0JZ0.003.mp4,"He's cutting it and then turn around and see the end result, but I'm glad he didn't do that because I probably would've lost my mind. As it was getting cut, I was just excited. I saw the snippets of hair falling to the floor and I was like, ""Yes!""",0.523364,0.552083,0.626374,0.601942,0.488889
1,zEyRyTnIw5I.005.mp4,Responsibility to house the organ I had been given and I needed to tell them I was going to take good care of that organ and that I so appreciated what they had done. Almost immediately I sent a letter to them,0.345794,0.375,0.472527,0.582524,0.366667
2,nskJh7v6v1U.004.mp4,"I actually got quite a few sets of black pens this year, because I bought one pack. I think I bought two packs, actually, that I really liked, and then I found ... Some people at my work had these really cool pens that I liked a lot, and I liked how they wrote-",0.252336,0.291667,0.406593,0.485437,0.511111


Unnamed: 0,file,text,extraversion,neuroticism,agreeableness,conscientiousness,openness
0,htH89DBizno.004.mp4,"... Going nuts from another room, run in there to check, there's no [inaudible 00:00:37], but it was like the [scissors 00:00:35] aren't there. Now maybe I'm just not sleeping enough that I moved the scissors somewhere, but I swear the-",0.485981,0.645833,0.681319,0.669903,0.822222
1,p_wf-KszNlk.001.mp4,"I've got a little bit to go but we need you there so...I have decided to do a Q&A video. Obviously I'm going to need you guy's help with that, or girls. Goddamn, now I'm going to get killed by",0.616822,0.59375,0.692308,0.514563,0.655556
2,MuYYY3XaJ7Q.001.mp4,"A video's quality over quantity, so everyone can have a chance to watch it and that's why I've been doing 2 to 3 days before each video. I've had a couple of people asking me about that. That's just like a quick little thing, trying to spice up this video.",0.46729,0.625,0.56044,0.524272,0.522222


In [13]:
# NOTE(review): looks like a leftover kernel smoke test — consider deleting this cell.
print('hello')

hello


In [None]:
from openai import OpenAI
import json
import os
from typing import Dict, List, Any

# Set up the OpenAI client.
# The API key is read from the OPENAI_API_KEY environment variable rather than
# being written into the notebook, so the secret is never saved or committed
# with the file. Set the variable before starting the kernel; the client
# raises at construction time if no key is available.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Trait order requested in the prompt; validated results carry scores in this order.
_TRAIT_ORDER = ["Agreeableness", "Neuroticism", "Openness", "Conscientiousness", "Extraversion"]


def _parse_trait_response(response_text: str) -> Dict[str, Any]:
    """Parse and validate the model's reply, returning the result dict or an error dict."""
    cleaned = response_text.strip()
    # Models sometimes wrap the JSON in a Markdown code fence despite the instructions.
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned.lower().startswith("json"):
            cleaned = cleaned[4:].strip()

    try:
        result = json.loads(cleaned)
    except json.JSONDecodeError:
        # Return the raw response for debugging
        return {
            "error": "JSON parsing failed",
            "raw_response": response_text
        }

    scores = result.get("scores") if isinstance(result, dict) else None
    scores_ok = (
        isinstance(scores, list)
        and len(scores) == len(_TRAIT_ORDER)
        and all(isinstance(s, (int, float)) and not isinstance(s, bool) for s in scores)
    )
    if not scores_ok:
        # Callers index scores positionally, so a malformed vector must not get through.
        return {
            "error": "Unexpected response structure",
            "raw_response": response_text
        }

    # If the model listed the same traits in a different order, realign the scores
    # so that positional consumers still read the right trait.
    traits = result.get("traits")
    if (
        isinstance(traits, list)
        and traits != _TRAIT_ORDER
        and all(isinstance(t, str) for t in traits)
        and sorted(traits) == sorted(_TRAIT_ORDER)
    ):
        score_by_trait = dict(zip(traits, scores))
        result["scores"] = [score_by_trait[name] for name in _TRAIT_ORDER]
        result["traits"] = list(_TRAIT_ORDER)

    return result


def get_personality_traits_gpt(text: str, model: str = "gpt-5-nano-2025-08-07") -> Dict[str, Any]:
    """
    Calls OpenAI API to analyze personality traits from text using the Big Five model.

    Failures are never raised to the caller; they are reported as a dict
    containing an "error" key, so callers must check for that key.

    Args:
        text (str): The text to analyze
        model (str): OpenAI chat model to use (default "gpt-5-nano-2025-08-07")

    Returns:
        On success, a dict with "traits" (list of 5 names) and "scores" (list of
        5 numbers, ordered Agreeableness, Neuroticism, Openness,
        Conscientiousness, Extraversion). On failure, a dict with "error" plus
        either "raw_response" (unparseable or malformed reply) or "type"
        (exception class name raised while calling the API).
    """

    prompt = f"""You are a careful, evidence-based psychologist who specialises in the Big Five (OCEAN) personality model.  
Your job is to infer approximate **Big Five trait scores** from a piece of text.

---

### 1. Task

Given an input **TEXT** that reflects a person's writing (messages, essays, posts, etc.), estimate their stable personality tendencies along the **Big Five** dimensions:

- Agreeableness  
- Neuroticism  
- Openness to Experience  
- Conscientiousness  
- Extraversion  

You must output **only** numeric scores between **0.0 and 1.0** (inclusive), where:

- 0.0 = extremely low on this trait  
- 0.5 = average / unsure  
- 1.0 = extremely high on this trait  

Use two or three decimal places.

---

### 2. Conceptual guides (use these when interpreting the text)

**Openness to Experience**  
- High: appreciates art, emotion, beauty, imagination, curiosity, variety, and unusual ideas; likes trying new things; creative, intellectually curious, uses rich/vivid language, reflects on abstract ideas; may hold unconventional beliefs and seek intense or euphoric experiences.  
- Low: prefers routine and familiarity; pragmatic, data-driven, and focused on practicality; disinterested in abstract or imaginative topics; can appear dogmatic or closed-minded.

**Conscientiousness**  
- High: self-disciplined, organised, dutiful, goal- and achievement-oriented; likes order, schedules, and planning; completes tasks promptly, pays attention to details, takes obligations seriously; behaviour is controlled and reliable.  
- Low: flexible and spontaneous but can be disorganised, messy, unreliable; procrastinates, forgets or abandons tasks; tends to "wing it" instead of planning carefully.

**Extraversion**  
- High: energetic, talkative, outgoing; enjoys social interaction and being around people; seeks external stimulation; starts conversations, likes being the centre of attention, active and enthusiastic in groups.  
- Low (introversion): quiet, reserved, low-key; prefers depth over breadth in social contacts; may avoid being centre of attention, keeps in the background; needs more time alone and less external stimulation, but is not necessarily unfriendly or depressed.

**Agreeableness**  
- High: kind, considerate, trusting and trustworthy, generous, compassionate; interested in others, takes time to help, feels others' emotions, makes people feel at ease; values social harmony and cooperation, optimistic about others' motives.  
- Low: puts own interests first; more skeptical or suspicious of others' motives; can be unfriendly, blunt, competitive, argumentative, or uncooperative; less concerned with others' problems or well-being.

**Neuroticism**  
- High: emotionally volatile and reactive; prone to strong negative emotions (anxiety, worry, anger, sadness); easily stressed or upset; mood swings, frequent irritability, pessimism; interprets situations as threatening or overwhelming, ruminates on problems.  
- Low (emotional stability): calm, even-tempered; less easily upset or stressed; negative emotions fade more quickly; generally emotionally stable and resilient (this does **not** automatically mean very positive or cheerful—that is more related to extraversion).

---

### 3. Important instructions

1. **Base your judgement only on the TEXT.**  
   - Do not assume traits that are not supported by evidence in the text.  
   - If the text is very short or ambiguous for a trait, keep that trait closer to **0.5** (uncertain/average).

2. **Focus on stable tendencies**, not temporary moods.  
   - Look for patterns in how the person talks about themselves, others, work, feelings, plans, and experiences.

3. **Use the full 0–1 range** when justified.  
   - Very strong, repeated signals of a trait → move closer to 0.1 or 0.9+.  
   - Neutral or mixed signals → keep near 0.4–0.6.  
   - Strong evidence of the opposite pole → move toward 0.0–0.2.

4. **No explanation in the final answer.**  
   - Internally you may reason, but your final output must strictly follow the required JSON format below, with no extra text.

---

### 4. Output format (MUST follow exactly)

Return **exactly one JSON object** with:

- `"traits"`: array of trait names in this exact order  
  `["Agreeableness", "Neuroticism", "Openness", "Conscientiousness", "Extraversion"]`
- `"scores"`: array of 5 floating-point numbers in the same order, each between 0.0 and 1.0 (inclusive), with 2–3 decimal places.

**Example of valid output format (structure only):**

    {{
      "traits": ["Agreeableness", "Neuroticism", "Openness", "Conscientiousness", "Extraversion"],
      "scores": [0.72, 0.31, 0.84, 0.59, 0.46]
    }}

Do **not** add comments, explanations, or any additional keys.

---

### 5. Now analyse this text

TEXT:
{text}"""

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a professional psychologist specializing in personality assessment using the Big Five model."},
                {"role": "user", "content": prompt}
            ]
        )

        # The message content can be None (e.g. a refusal); report that explicitly
        # instead of relying on an AttributeError from .strip().
        content = response.choices[0].message.content
        if content is None:
            return {
                "error": "Empty response from model",
                "raw_response": ""
            }

        return _parse_trait_response(content.strip())

    except Exception as e:
        # Best-effort by design: API/network failures become an error dict so a
        # batch run keeps going; callers check for the "error" key.
        return {
            "error": str(e),
            "type": type(e).__name__
        }

# Example usage function
def analyze_personality_batch(texts: List[str], model: str = "gpt-4") -> List[Dict]:
    """
    Run get_personality_traits_gpt on each text in turn, printing progress.

    Args:
        texts (List[str]): Texts to analyze, processed sequentially in order
        model (str): OpenAI model name forwarded to get_personality_traits_gpt
            (NOTE(review): this default, "gpt-4", differs from that function's
            own default — confirm which is intended)

    Returns:
        One result dict per input text, in the same order as `texts`
    """
    total = len(texts)
    collected: List[Dict] = []
    for position, text in enumerate(texts, start=1):
        print(f"Analyzing text {position}/{total}...")
        collected.append(get_personality_traits_gpt(text, model))
    return collected

# Confirm the cell ran to completion and note which client API style it targets.
for status_line in (
    "OpenAI personality analysis functions loaded!",
    "Using OpenAI API v1.0+ client format",
):
    print(status_line)

OpenAI personality analysis functions loaded!
Using OpenAI API v1.0+ client format


In [2]:
# Test the personality analysis function
# Make sure you have set your OpenAI API key first!

# Example text for testing
sample_text = """
Thank you for this opportunity. I am a fast-working, diligent and self-disciplined employee with the skills to meet the demands of the role. I have a track record of achievement. I attained excellent grades in my chosen subjects at college and in previous roles, I was recognised for my trustworthiness, strong work ethic and collaboration skills. If you hire me, I will be a positive role model for the company and work hard to ensure I give you a solid return on my salary.
"""

# Test with a single text.
# get_personality_traits_gpt reports API and parsing failures as a dict with an
# "error" key instead of raising, so that case is checked explicitly; the
# except branch only covers unexpected failures in this cell itself.
try:
    print("Testing personality analysis...")
    result = get_personality_traits_gpt(sample_text)
    if 'error' in result:
        print(f"Error occurred: {result['error']}")
        print("Make sure your OpenAI API key is correctly set!")
    print("Result:")
    print(json.dumps(result, indent=2))
except Exception as e:
    print(f"Error occurred: {e}")
    print("Make sure your OpenAI API key is correctly set!")

# Example of how to analyze your training data
# (uncomment and run when you have API key set up and tested)

# print("\nAnalyzing first 3 training samples...")
# sample_texts = annotation_transcription_train['text'].head(3).tolist()
# results = analyze_personality_batch(sample_texts)
# for i, result in enumerate(results):
#     print(f"\nSample {i+1} result:")
#     print(json.dumps(result, indent=2))

Testing personality analysis...
Result:
{
  "traits": [
    "Agreeableness",
    "Neuroticism",
    "Openness",
    "Conscientiousness",
    "Extraversion"
  ],
  "scores": [
    0.78,
    0.25,
    0.5,
    0.92,
    0.4
  ]
}


In [18]:
import numpy as np
from sklearn.metrics import mean_absolute_error

# Get first 10 rows from test data
test_sample = annotation_transcription_test.head(10)

print(f"Analyzing {len(test_sample)} test samples...")
print(f"Columns in test data: {test_sample.columns.tolist()}\n")

# Extract texts from the test data
test_texts = test_sample['text'].tolist()

# Analyze personality traits using GPT (one API call per text)
print("Running GPT analysis on test samples...")
gpt_results = analyze_personality_batch(test_texts, model="gpt-5-mini-2025-08-07")

print("\n" + "="*80)
print("RESULTS:")
print("="*80)

# Trait order the prompt asks GPT to use for its "scores" array. The ground-truth
# columns are the lowercase trait names, so the same list drives both sides and
# the two vectors cannot drift out of alignment.
trait_names = ["Agreeableness", "Neuroticism", "Openness", "Conscientiousness", "Extraversion"]

# Per-sample prediction and ground-truth vectors (successful samples only)
all_predictions = []
all_ground_truth = []

for idx, result in enumerate(gpt_results):
    print(f"\n--- Sample {idx + 1} ---")

    if 'error' in result:
        print(f"Error: {result['error']}")
        continue

    # Get GPT predictions; skip vectors of the wrong length, which would
    # otherwise be silently truncated by zip() and break the metric arrays.
    gpt_scores = result.get('scores') or []
    if len(gpt_scores) != len(trait_names):
        print(f"Error: expected {len(trait_names)} scores, got {len(gpt_scores)}")
        continue

    # Ground truth for this sample, in the same order as the GPT output
    row = test_sample.iloc[idx]
    ground_truth = [row[trait.lower()] for trait in trait_names]

    print(f"Text preview: {test_texts[idx][:100]}...")
    print(f"\nGPT Predictions:")
    for trait, score in zip(trait_names, gpt_scores):
        print(f"  {trait:20s}: {score:.3f}")

    print(f"\nGround Truth:")
    for trait, score in zip(trait_names, ground_truth):
        print(f"  {trait:20s}: {score:.3f}")

    print(f"\nAbsolute Errors:")
    for trait, pred, true in zip(trait_names, gpt_scores, ground_truth):
        error = abs(pred - true)
        print(f"  {trait:20s}: {error:.3f}")

    all_predictions.append(gpt_scores)
    all_ground_truth.append(ground_truth)

# Calculate overall MAE
if all_predictions:
    all_predictions = np.array(all_predictions)
    all_ground_truth = np.array(all_ground_truth)

    print("\n" + "="*80)
    print("OVERALL METRICS:")
    print("="*80)

    # Overall MAE across all traits and samples
    overall_mae = mean_absolute_error(all_ground_truth.flatten(), all_predictions.flatten())
    print(f"\nOverall MAE (all traits): {overall_mae:.4f}")

    # MAE per trait (column i of both arrays corresponds to trait_names[i])
    print(f"\nMAE per trait:")
    for i, trait in enumerate(trait_names):
        trait_mae = mean_absolute_error(all_ground_truth[:, i], all_predictions[:, i])
        print(f"  {trait:20s}: {trait_mae:.4f}")

    # Additional statistics, all derived from the same residual matrix
    residuals = all_ground_truth - all_predictions
    abs_errors = np.abs(residuals)
    mse = np.mean(residuals ** 2)
    print(f"\nAdditional Statistics:")
    print(f"  Mean Squared Error (MSE): {mse:.4f}")
    print(f"  Root Mean Squared Error (RMSE): {np.sqrt(mse):.4f}")
    print(f"  Max Absolute Error: {np.max(abs_errors):.4f}")
    print(f"  Min Absolute Error: {np.min(abs_errors):.4f}")

else:
    print("\nNo successful predictions to calculate MAE!")

Analyzing 10 test samples...
Columns in test data: ['file', 'text', 'extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'openness']

Running GPT analysis on test samples...
Analyzing text 1/10...
Analyzing text 2/10...
Analyzing text 3/10...
Analyzing text 4/10...
Analyzing text 5/10...
Analyzing text 6/10...
Analyzing text 7/10...
Analyzing text 8/10...
Analyzing text 9/10...
Analyzing text 10/10...

RESULTS:

--- Sample 1 ---
Text preview: ... Going nuts from another room, run in there to check, there's no [inaudible 00:00:37], but it was...

GPT Predictions:
  Agreeableness       : 0.500
  Neuroticism         : 0.700
  Openness            : 0.450
  Conscientiousness   : 0.350
  Extraversion        : 0.450

Ground Truth:
  Agreeableness       : 0.681
  Neuroticism         : 0.646
  Openness            : 0.822
  Conscientiousness   : 0.670
  Extraversion        : 0.486

Absolute Errors:
  Agreeableness       : 0.181
  Neuroticism         : 0.054
  Openness            

In [None]:
# GPT-5 mini — model used for the evaluation run above (gpt-5-mini-2025-08-07)

In [20]:
import numpy as np
from sklearn.metrics import mean_absolute_error

# Get first 10 rows from test data
test_sample = annotation_transcription_test.head(10)

print(f"Analyzing {len(test_sample)} test samples...")
print(f"Columns in test data: {test_sample.columns.tolist()}\n")

# Extract texts from the test data
test_texts = test_sample['text'].tolist()

# Number of predictions per text for averaging
num_predictions = 3

# Analyze personality traits using GPT - make multiple predictions per text.
# Each run is a full pass over all texts, so this issues
# num_predictions * len(test_texts) API calls.
print(f"Running GPT analysis on test samples (making {num_predictions} predictions per text)...")
all_runs_results = []

for run in range(num_predictions):
    print(f"\n--- Prediction Run {run + 1}/{num_predictions} ---")
    gpt_results = analyze_personality_batch(test_texts, model="gpt-5-nano-2025-08-07")
    all_runs_results.append(gpt_results)

print("\n" + "="*80)
print("RESULTS (with Averaged Predictions):")
print("="*80)

# Trait order the prompt asks GPT to use for its "scores" array. The ground-truth
# columns are the lowercase trait names, so the same list drives both sides and
# the two vectors cannot drift out of alignment.
trait_names = ["Agreeableness", "Neuroticism", "Openness", "Conscientiousness", "Extraversion"]

# Create arrays to store predictions and ground truth
all_predictions = []
all_ground_truth = []
all_individual_predictions = []  # Store every run's predictions for variance analysis

for idx in range(len(test_texts)):
    print(f"\n{'='*80}")
    print(f"Sample {idx + 1}")
    print('='*80)

    # Collect all predictions for this text from all runs
    predictions_for_text = []
    has_error = False

    for run in range(num_predictions):
        result = all_runs_results[run][idx]
        if 'error' in result:
            print(f"Error in run {run + 1}: {result['error']}")
            has_error = True
            break
        # A score vector of the wrong length would make the per-text array
        # ragged and break the averaging below, so treat it as a failed run.
        run_scores = result.get('scores') or []
        if len(run_scores) != len(trait_names):
            print(f"Error in run {run + 1}: expected {len(trait_names)} scores, got {len(run_scores)}")
            has_error = True
            break
        predictions_for_text.append(run_scores)

    # A sample is only scored when every run succeeded, which keeps the
    # (samples, runs, traits) array rectangular for the variance analysis.
    if has_error:
        continue

    # Calculate average predictions across all runs
    predictions_array = np.array(predictions_for_text)  # Shape: (num_predictions, 5)
    avg_predictions = np.mean(predictions_array, axis=0)  # Average across runs
    std_predictions = np.std(predictions_array, axis=0)   # Standard deviation for uncertainty

    # Ground truth for this sample, in the same order as the GPT output
    row = test_sample.iloc[idx]
    ground_truth = [row[trait.lower()] for trait in trait_names]

    print(f"Text preview: {test_texts[idx][:100]}...")

    # Show individual predictions
    print(f"\nIndividual Predictions (across {num_predictions} runs):")
    for run_num, pred in enumerate(predictions_for_text, 1):
        print(f"  Run {run_num}:")
        for trait, score in zip(trait_names, pred):
            print(f"    {trait:20s}: {score:.3f}")

    # Show averaged predictions with standard deviation
    print(f"\nAveraged GPT Predictions (±std):")
    for trait, avg_score, std_score in zip(trait_names, avg_predictions, std_predictions):
        print(f"  {trait:20s}: {avg_score:.3f} (±{std_score:.3f})")

    print(f"\nGround Truth:")
    for trait, score in zip(trait_names, ground_truth):
        print(f"  {trait:20s}: {score:.3f}")

    print(f"\nAbsolute Errors (using averaged predictions):")
    for trait, avg_pred, true in zip(trait_names, avg_predictions, ground_truth):
        error = abs(avg_pred - true)
        print(f"  {trait:20s}: {error:.3f}")

    all_predictions.append(avg_predictions)
    all_ground_truth.append(ground_truth)
    all_individual_predictions.append(predictions_array)

# Calculate overall MAE
if all_predictions:
    all_predictions = np.array(all_predictions)
    all_ground_truth = np.array(all_ground_truth)

    print("\n" + "="*80)
    print("OVERALL METRICS (using averaged predictions):")
    print("="*80)

    # Overall MAE across all traits and samples
    overall_mae = mean_absolute_error(all_ground_truth.flatten(), all_predictions.flatten())
    print(f"\nOverall MAE (all traits): {overall_mae:.4f}")

    # MAE per trait (column i of both arrays corresponds to trait_names[i])
    print(f"\nMAE per trait:")
    for i, trait in enumerate(trait_names):
        trait_mae = mean_absolute_error(all_ground_truth[:, i], all_predictions[:, i])
        print(f"  {trait:20s}: {trait_mae:.4f}")

    # Additional statistics, all derived from the same residual matrix
    residuals = all_ground_truth - all_predictions
    abs_errors = np.abs(residuals)
    mse = np.mean(residuals ** 2)
    print(f"\nAdditional Statistics:")
    print(f"  Mean Squared Error (MSE): {mse:.4f}")
    print(f"  Root Mean Squared Error (RMSE): {np.sqrt(mse):.4f}")
    print(f"  Max Absolute Error: {np.max(abs_errors):.4f}")
    print(f"  Min Absolute Error: {np.min(abs_errors):.4f}")

    # Prediction variance analysis
    print(f"\nPrediction Variance Analysis:")
    all_individual_predictions_array = np.array(all_individual_predictions)  # Shape: (samples, runs, traits)
    mean_std_per_trait = np.mean(np.std(all_individual_predictions_array, axis=1), axis=0)  # Average std across samples
    print(f"  Average prediction uncertainty (std) per trait:")
    for trait, std_val in zip(trait_names, mean_std_per_trait):
        print(f"    {trait:20s}: {std_val:.4f}")

else:
    print("\nNo successful predictions to calculate MAE!")

Analyzing 10 test samples...
Columns in test data: ['file', 'text', 'extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'openness']

Running GPT analysis on test samples (making 3 predictions per text)...

--- Prediction Run 1/3 ---
Analyzing text 1/10...
Analyzing text 2/10...
Analyzing text 3/10...
Analyzing text 4/10...
Analyzing text 5/10...
Analyzing text 6/10...
Analyzing text 7/10...
Analyzing text 8/10...
Analyzing text 9/10...
Analyzing text 10/10...

--- Prediction Run 2/3 ---
Analyzing text 1/10...
Analyzing text 2/10...
Analyzing text 3/10...
Analyzing text 4/10...
Analyzing text 5/10...
Analyzing text 6/10...
Analyzing text 7/10...
Analyzing text 8/10...
Analyzing text 9/10...
Analyzing text 10/10...

--- Prediction Run 3/3 ---
Analyzing text 1/10...
Analyzing text 2/10...
Analyzing text 3/10...
Analyzing text 4/10...
Analyzing text 5/10...
Analyzing text 6/10...
Analyzing text 7/10...
Analyzing text 8/10...
Analyzing text 9/10...
Analyzing text 10/10...



In [None]:
# gpt-5-nano-2025-08-07 — model used for the averaged evaluation run above

In [None]:
# gpt-5-nano-2025-08-07