# Mello ML - Fresh Unified Personality System

This notebook demonstrates the complete pipeline:
- **Unified personality approach**: Cultural data informs personality traits
- **768D embeddings**: Interests + 5 personality traits
- **50 archetypes**: Diverse synthetic user generation
- **Real user support**: Load from JSON files
- **2D visualization**: PCA and UMAP plotting

## 📦 Imports and Setup

In [None]:
# Fresh system components
from src.user import User
from src.profile_generator import ProfileGenerator
from src.embedding_generator import EmbeddingGenerator
from src.population import Population
from src.visualizer import Visualizer

## 🎭 Generate 50 Synthetic Users

Creates diverse synthetic users using 50 personality archetypes with unified personality profiling.

In [None]:
print("🎭 Generating 50 synthetic users with unified personality approach...")
print("This may take several minutes due to API calls...")
print()


synthetic_users_generated = len(population.users)
target_count = 158 + synthetic_users_generated

for i in range(target_count):
    print(f"   Generating user {i+1}/{target_count}...", end=" ")
    
    try:
        # Generate synthetic user data
        user_data = profile_generator.generate_synthetic_user_data()
        
        if user_data:
            # Create user from generated data
            user = User.from_json_data(user_data)
            
            # Generate unified profiles (interests + personality)
            profiles_success = profile_generator.generate_complete_profiles(user)
            
            if profiles_success:
                # Generate embeddings (768D each)
                embeddings_success = embedding_generator.embed_user_complete(user)
                
                if embeddings_success:
                    population.add_user(user)
                    synthetic_users_generated += 1
                    archetype = user_data.get('metadata', {}).get('archetype', 'Unknown')
                    print(f"✅ {user.name} ({archetype[:30]}...)")
                else:
                    print(f"❌ Failed embeddings")
            else:
                print(f"❌ Failed profiles")
        else:
            print(f"❌ Failed data generation")
    
    except Exception as e:
        print(f"❌ Error: {e}")
    
    # Progress update every 10 users
    if (i + 1) % 10 == 0:
        print(f"\n📈 Progress: {synthetic_users_generated}/{i + 1} users completed\n")

print(f"\n🎉 Synthetic user generation complete!")
print(f"✅ Successfully generated: {synthetic_users_generated}/{target_count} users")
print(f"📊 Success rate: {synthetic_users_generated/target_count*100:.1f}%")
print(f"👥 Population size: {len(population)} users")

## 👤 Load Real User

Loads a real user from JSON file and processes through the same pipeline.

In [None]:
# Load real user from JSON file
real_user_path = "data/sofiia.json"

print(f"👤 Loading real user from: {real_user_path}")

try:
    # Load user from JSON
    real_user = User.from_json_file(real_user_path)
    real_user.special = True  # Mark as special for visualization
    
    print(f"✅ Loaded user: {real_user.name}")
    print(f"   Major: {real_user.profile_data.get('major', 'Unknown')}")
    print(f"   Bio: {real_user.profile_data.get('bio', 'No bio')[:100]}...")
    print(f"   Interests: {', '.join(real_user.profile_data.get('interests', [])[:5])}")
    
    # Generate unified personality profile from cultural data
    print(f"\n🔄 Processing {real_user.name} through unified pipeline...")
    print(f"   1. Generating unified interests profile from cultural preferences...")
    
    profiles_success = profile_generator.generate_complete_profiles(real_user)
    
    if profiles_success:
        print(f"   ✅ Generated unified profiles")
        
        # Show profile preview
        if real_user.interests_profile:
            print(f"   📖 Interests profile: {real_user.interests_profile[:150]}...")
        
        if real_user.personality_profiles:
            print(f"   🧠 Personality traits: {list(real_user.personality_profiles.keys())}")
        
        # Generate embeddings
        print(f"   2. Generating 768D embeddings...")
        embeddings_success = embedding_generator.embed_user_complete(real_user)
        
        if embeddings_success:
            print(f"   ✅ Generated embeddings (6 × 768D)")
            
            # Verify embedding dimensions
            combined = real_user.get_combined_embedding()
            if combined is not None:
                print(f"   🔢 Combined embedding shape: {combined.shape}")
            
            # Add to population
            population.add_user(real_user)
            print(f"   ✅ Added to population")
            
        else:
            print(f"   ❌ Failed to generate embeddings")
    else:
        print(f"   ❌ Failed to generate profiles")

except FileNotFoundError:
    print(f"❌ File not found: {real_user_path}")
    print(f"   Please ensure the JSON file exists in the correct location")
except Exception as e:
    print(f"❌ Error loading real user: {e}")

print(f"\n👥 Final population: {len(population)} users")
print(f"📊 Users with embeddings: {len(population.get_users_with_embeddings())}")

In [None]:
#population = Population()
#population = population.load_from_json("data/mello_population.json")

## 📊 Population Statistics

Analyze the generated population and embedding quality.

In [3]:
# Get population statistics
stats = population.get_statistics()

print("📊 Population Statistics")
print("=" * 30)
#print(f"Population Name: {stats['population_name']}")
print(f"Total Users: {stats['total_users']}")
print(f"Users with Profiles: {stats['users_with_profiles']}")
print(f"Users with Embeddings: {stats['users_with_embeddings']}")

if stats['embedding_stats']:
    print(f"\n🔢 Embedding Dimensions:")
    for key, value in stats['embedding_stats'].items():
        if isinstance(value, int):
            print(f"   {key}: {value}D")
        elif isinstance(value, dict):
            print(f"   {key}:")
            for trait, dims in value.items():
                print(f"     {trait}: {dims}D")

# Find special users
special_users = [user for user in population.users if user.special]
print(f"\n⭐ Special Users: {len(special_users)}")
for user in special_users:
    print(f"   {user.name} - {user.profile_data.get('major', 'Unknown major')}")

# Embedding summary
embedding_summary = visualizer.create_embedding_summary(population)
print(f"\n🎯 Embedding Modes Available:")
for mode, info in embedding_summary['embedding_modes'].items():
    if 'users_count' in info and info['users_count'] > 0:
        print(f"   {mode}: {info['users_count']} users, {info.get('dimensions', '?')}D")

📊 Population Statistics
Total Users: 248
Users with Profiles: 248
Users with Embeddings: 240

🔢 Embedding Dimensions:
   interests_dims: 768D
   trait_dims:
     Openness: 768D
     Conscientiousness: 768D
     Extraversion: 768D
     Agreeableness: 768D
     Neuroticism: 768D
   combined_dims: 4608D

⭐ Special Users: 9
   Yahya Rahhawi - Computer science, Philosophy
   Einstein - Unknown major
   Mary Curry - Unknown major
   Bruce Wayne - Unknown major
   Jimmy McGill - Unknown major
   Leonardo da Vinci - Unknown major
   Alyosha Karamazov - Unknown major
   Sam Altman - Unknown major
   Donald Trump - Unknown major

🎯 Embedding Modes Available:
   combined: 240 users, 4608D
   interests: 240 users, 768D
   Openness: 240 users, 768D
   Conscientiousness: 240 users, 768D
   Extraversion: 240 users, 768D
   Agreeableness: 240 users, 768D
   Neuroticism: 240 users, 768D


## 🔍 Similarity Analysis

Test similarity search with the real user (if loaded).

In [9]:
# Find the real user for similarity testing
real_user = None
for user in population.users:
    if user.special and user.name == "Sofiia":
        real_user = user
        break

if real_user and len(population.get_users_with_embeddings()) >= 5:
    print(f"🔍 Similarity Analysis for {real_user.name}")
    print("=" * 50)
    
    # Combined similarity (all embeddings)
    print(f"\n🎯 Most Similar Users (Combined Embeddings):")
    similar_combined = population.find_similar_users(real_user, mode='combined', top_k=5)
    
    for i, (similar_user, score) in enumerate(similar_combined, 1):
        archetype = similar_user.metadata.get('original_data', {}).get('metadata', {}).get('archetype', 'Unknown')
        print(f"   {i}. {similar_user.name}: {score:.3f} ({archetype[:40]}...)")
    
    # Interests similarity
    print(f"\n📚 Most Similar Users (Interests Only):")
    similar_interests = population.find_similar_users(real_user, mode='interests', top_k=5)
    
    for i, (similar_user, score) in enumerate(similar_interests, 1):
        archetype = similar_user.metadata.get('original_data', {}).get('metadata', {}).get('archetype', 'Unknown')
        print(f"   {i}. {similar_user.name}: {score:.3f} ({archetype[:40]}...)")
    
    # Trait-specific similarities
    print(f"\n🧠 Trait-Specific Most Similar Users:")
    traits = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']
    
    for trait in traits:
        try:
            similar_trait = population.find_similar_users(real_user, mode=trait, top_k=1)
            if similar_trait:
                most_similar, score = similar_trait[0]
                print(f"   {trait}: {most_similar.name} ({score:.3f})")
        except Exception as e:
            print(f"   {trait}: Error - {str(e)[:50]}...")

else:
    print(f"⚠️  Cannot perform similarity analysis:")
    if not real_user:
        print(f"   - No real user loaded (special=True)")
    if len(population.get_users_with_embeddings()) < 5:
        print(f"   - Need at least 5 users with embeddings (have {len(population.get_users_with_embeddings())})")

⚠️  Cannot perform similarity analysis:
   - No real user loaded (special=True)


In [10]:
# Interactive Plotly Visualizations
users_with_embeddings = population.get_users_with_embeddings()

print(f"📊 Interactive Plotly Population Visualization")
print(f"Users with embeddings: {len(users_with_embeddings)}")
print(f"Similarity metric: Euclidean Distance (L2 norm, normalized)")
print()

if len(users_with_embeddings) >= 3:
    
    # 📚 Plotly PCA Visualization - Interests Only
    print(f"📚 Creating Plotly PCA - Interests Embeddings (768D → 2D):")
    try:
        fig_interests = visualizer.plot_population_pca(
            population, 
            mode='interests', 
            highlight_special=True, 
            figsize=(12, 8)
        )
        fig_interests.show()
        print(f"✅ Plotly interests PCA complete")
        
    except Exception as e:
        print(f"❌ Interests PCA failed: {e}")
    
    print()
    
    # 🧠 Plotly PCA Visualization - Combined Personality Traits
    print(f"🧠 Creating Plotly PCA - Combined Personality Traits (3840D → 2D):")
    try:
        # Get combined personality embedding (all 5 traits)
        from sklearn.decomposition import PCA
        from sklearn.preprocessing import StandardScaler
        import plotly.graph_objects as go
        import numpy as np
        
        # Get users with complete personality embeddings
        personality_users = []
        personality_embeddings = []
        
        for user in users_with_embeddings:
            trait_embeddings = []
            has_all_traits = True
            
            for trait in ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']:
                trait_emb = getattr(user, f'{trait.lower()}_embedding', None)
                if trait_emb is not None:
                    trait_embeddings.append(trait_emb)
                else:
                    has_all_traits = False
                    break
            
            if has_all_traits:
                personality_users.append(user)
                combined_personality = np.concatenate(trait_embeddings)
                personality_embeddings.append(combined_personality)
        
        if len(personality_embeddings) >= 3:
            personality_matrix = np.array(personality_embeddings)
            
            # Apply PCA to personality embeddings
            scaler = StandardScaler()
            embeddings_scaled = scaler.fit_transform(personality_matrix)
            pca = PCA(n_components=2)
            embeddings_2d = pca.fit_transform(embeddings_scaled)
            
            # Create Plotly figure for personality
            fig_personality = go.Figure()
            
            # Separate special and regular users
            special_indices = []
            regular_indices = []
            
            for i, user in enumerate(personality_users):
                if user.special:
                    special_indices.append(i)
                else:
                    regular_indices.append(i)
            
            # Plot regular users
            if regular_indices:
                regular_coords = embeddings_2d[regular_indices]
                regular_names = [personality_users[i].name for i in regular_indices]
                
                fig_personality.add_trace(go.Scatter(
                    x=regular_coords[:, 0],
                    y=regular_coords[:, 1],
                    mode='markers',
                    marker=dict(size=8, color='lightgreen', opacity=0.7, line=dict(width=1, color='darkgreen')),
                    name=f'Users ({len(regular_indices)})',
                    hovertext=regular_names,
                    hovertemplate='<b>%{hovertext}</b><extra></extra>'
                ))
            
            # Plot special users
            if special_indices:
                special_coords = embeddings_2d[special_indices]
                special_names = [personality_users[i].name for i in special_indices]
                
                fig_personality.add_trace(go.Scatter(
                    x=special_coords[:, 0],
                    y=special_coords[:, 1],
                    mode='markers',
                    marker=dict(size=15, color='red', opacity=0.9, symbol='star', line=dict(width=2, color='darkred')),
                    name=f'Special Users ({len(special_indices)})',
                    hovertext=special_names,
                    hovertemplate='<b>%{hovertext}</b><extra></extra>'
                ))
            
            # Update layout
            total_variance = pca.explained_variance_ratio_[:2].sum()
            fig_personality.update_layout(
                title=f'PCA Visualization - Personality Traits Embeddings<br>{population.name} ({len(personality_users)} users)',
                xaxis_title=f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)',
                yaxis_title=f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)',
                hovermode='closest',
                showlegend=True,
                width=960,
                height=640,
                annotations=[
                    dict(
                        text=f'Total variance explained: {total_variance:.1%}<br>Similarity metric: Euclidean Distance<br>Dimensions: 5 traits × 768D = 3840D',
                        xref="paper", yref="paper",
                        x=0.02, y=0.98, xanchor='left', yanchor='top',
                        showarrow=False,
                        font=dict(size=12),
                        bgcolor="rgba(255,255,255,0.8)",
                        bordercolor="rgba(0,0,0,0.5)",
                        borderwidth=1
                    )
                ]
            )
            
            fig_personality.show()
            print(f"✅ Plotly personality traits PCA complete")
        else:
            print(f"❌ Need at least 3 users with complete personality embeddings, got {len(personality_embeddings)}")
        
    except Exception as e:
        print(f"❌ Personality PCA failed: {e}")

else:
    print(f"⚠️  Need at least 3 users with embeddings for visualization")
    print(f"   Current: {len(users_with_embeddings)} users")
    print(f"   Generate more synthetic users in the previous cell")

print(f"\n📖 Visualization Guide:")
print(f"   📚 Interests PCA: Cultural preferences embeddings (books, movies, music)")
print(f"   🧠 Personality PCA: Big 5 personality traits embeddings (Openness, Conscientiousness, etc.)")
print(f"   ⭐ Red stars: Real users (special)")
print(f"   🔵 Blue/Green dots: Synthetic users")
print(f"   📏 Similarity: Euclidean distance (1.0 = identical, 0.0 = different)")
print(f"   🎨 Interactive: Hover for names, zoom, pan to explore")

📊 Interactive Plotly Population Visualization
Users with embeddings: 255
Similarity metric: Euclidean Distance (L2 norm, normalized)

📚 Creating Plotly PCA - Interests Embeddings (768D → 2D):


✅ Plotly interests PCA complete

🧠 Creating Plotly PCA - Combined Personality Traits (3840D → 2D):


✅ Plotly personality traits PCA complete

📖 Visualization Guide:
   📚 Interests PCA: Cultural preferences embeddings (books, movies, music)
   🧠 Personality PCA: Big 5 personality traits embeddings (Openness, Conscientiousness, etc.)
   ⭐ Red stars: Real users (special)
   🔵 Blue/Green dots: Synthetic users
   📏 Similarity: Euclidean distance (1.0 = identical, 0.0 = different)
   🎨 Interactive: Hover for names, zoom, pan to explore


## 💾 Save Population

Save the complete population for future use.

In [None]:
# Save population to JSON
save_path = "data/mello_population.json"

print(f"💾 Saving population to {save_path}...")

try:
    population.save_to_json(save_path)
    print(f"✅ Population saved successfully")
    
    # Show file info
    import os
    file_size = os.path.getsize(save_path)
    print(f"   File size: {file_size:,} bytes ({file_size/1024/1024:.1f} MB)")
    print(f"   Users saved: {len(population)}")
    print(f"   Users with embeddings: {len(population.get_users_with_embeddings())}")
    
except Exception as e:
    print(f"❌ Failed to save population: {e}")

print(f"\n🎉 Notebook complete!")
print(f"📊 Final Statistics:")
print(f"   Population: {len(population)} users")
print(f"   Architecture: 768D interests + 5×768D traits")
print(f"   Approach: Unified personality profiling")
print(f"   Embeddings: {len(population.get_users_with_embeddings())} users ready")

In [5]:
einstein = ProfileGenerator.generate_profile_from_famous_person(profile_generator,"Einstein")

In [6]:
famous_people = [
    "Nikola Tesla",
    "Ada Lovelace",
    "Alan Turing",
    "Aristotle",
    "Nelson Mandela",
    "Mahatma Gandhi",
    "Cleopatra",
    "Steve Jobs",
    "Elon Musk",
    "Sherlock Holmes",
    "Tony Stark",
    "Walter White",
    "Leonardo da Vinci",
    "Greta Thunberg"]
c = 1
for person in famous_people:
    famous_user = profile_generator.create_user_from_famous_person(person, person)
    if hasattr(famous_user, "special"):
        famous_user.special = True
    success = embedding_generator.embed_user_complete(famous_user)
    print(c/len(famous_people) * 100, "%")
    c += 1
    if success:
        population.add_user(famous_user)

7.142857142857142 %
14.285714285714285 %
21.428571428571427 %
28.57142857142857 %
35.714285714285715 %
42.857142857142854 %
50.0 %
57.14285714285714 %
64.28571428571429 %
71.42857142857143 %
78.57142857142857 %
85.71428571428571 %
92.85714285714286 %
100.0 %


In [11]:
# Minimal UMAP + Plotly 3D scatter for interests embeddings
from umap import UMAP
from sklearn.preprocessing import StandardScaler
import numpy as np
import plotly.graph_objects as go

# Collect (name, special, embedding) for users that have interests embeddings
data = [
    (u.name, bool(getattr(u, "special", False)), u.interests_embedding)
    for u in population.get_users_with_embeddings()
    if getattr(u, "interests_embedding", None) is not None
]

if len(data) < 3:
    print(f"Need ≥3 users with interests embeddings, got {len(data)}")
else:
    names, specials, embs = zip(*data)
    X = np.vstack(embs)
    X = StandardScaler().fit_transform(X)

    # 3D UMAP
    umap = UMAP(
        n_components=3,
        n_neighbors=3,
        min_dist=3,
        metric="euclidean",
        random_state=42,
        spread = 3
    )
    X3 = umap.fit_transform(X)

    specials = np.array(specials, dtype=bool)
    reg_idx = np.where(~specials)[0]
    spc_idx = np.where(specials)[0]

    fig = go.Figure()

    if len(reg_idx):
        fig.add_trace(go.Scatter3d(
            x=X3[reg_idx, 0], y=X3[reg_idx, 1], z=X3[reg_idx, 2],
            mode="markers",
            marker=dict(size=5, opacity=0.7, color="lightgreen"),
            name=f"Users ({len(reg_idx)})",
            hovertext=[names[i] for i in reg_idx],
            hovertemplate="<b>%{hovertext}</b><extra></extra>"
        ))

    if len(spc_idx):
        fig.add_trace(go.Scatter3d(
            x=X3[spc_idx, 0], y=X3[spc_idx, 1], z=X3[spc_idx, 2],
            mode="markers",
            marker=dict(size=9, symbol="diamond", color="red", line=dict(width=2)),
            name=f"Special ({len(spc_idx)})",
            hovertext=[names[i] for i in spc_idx],
            hovertemplate="<b>%{hovertext}</b><extra></extra>"
        ))

    fig.update_layout(
        title=f"UMAP – Interests Embeddings 3D ({len(names)} users)",
        scene=dict(
            xaxis_title="UMAP-1",
            yaxis_title="UMAP-2",
            zaxis_title="UMAP-3"
        ),
        width=900, height=700,
        hovermode="closest"
    )

    fig.show()


divide by zero encountered in power

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [17]:
for user in population.users:
    if "sofiia" in user.name.lower():
        print(user.name)
        print(user.personality_profiles)
        print(user.interests_profile)

        break

Sofiia 
{'Openness': "This individual demonstrates a selective curiosity, enjoying intellectual exploration and a willingness to engage with abstract or experimental art, even when not fully understood. They are keen on learning about unfamiliar topics for personal enrichment and appreciate debating differing viewpoints to broaden their understanding. However, this openness doesn't consistently extend to practical experiences; they show a preference for familiar comforts in areas like media consumption and may not actively seek out new culinary or travel experiences, suggesting a more theoretical than experiential approach to novelty.", 'Conscientiousness': 'This individual exhibits a profound lack of conscientiousness, indicating a generally disorganized and impulsive approach to life. There is no evidence of routine planning, timely execution of tasks, or a proactive stance on responsibilities. They likely struggle with deadlines, maintain cluttered environments, and manage finances 

In [20]:
# UMAP 2D for each personality trait embedding across users
# Assumes:
# - `population.get_users_with_embeddings()` returns iterable of user objects
# - Each user may have trait embeddings like `conscientiousness_embedding` (1D vector)
# - Optional boolean flag `user.special` to highlight certain users

from umap import UMAP
from sklearn.preprocessing import StandardScaler
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

traits = [
    "conscientiousness",
    "openness",
    "agreeableness",
    "neuroticism",
    "extraversion",
]

def collect_trait_data(trait: str):
    attr = f"{trait}_embedding"
    rows = [
        (u.name, bool(getattr(u, "special", False)), getattr(u, attr))
        for u in population.get_users_with_embeddings()
        if getattr(u, attr, None) is not None
    ]
    if not rows:
        return [], [], None
    names, specials, embs = zip(*rows)
    X = np.vstack(embs)
    return list(names), np.array(specials, dtype=bool), X

def umap_2d(X: np.ndarray, metric: str = "manhattan", n_neighbors: int = 10, min_dist: float = 0.15):
    Xs = StandardScaler().fit_transform(X)
    reducer = UMAP(
        n_components=2,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=42,
    )
    return reducer.fit_transform(Xs)

# Build a subplot grid for all traits (2 rows x 3 cols)
rows, cols = 2, 3
fig = make_subplots(
    rows=rows, cols=cols,
    subplot_titles=[t.title() for t in traits] + ([""] * (rows*cols - len(traits)))
)

for idx, trait in enumerate(traits):
    r = idx // cols + 1
    c = idx % cols + 1

    names, specials, X = collect_trait_data(trait)
    if len(names) < 3:
        fig.add_annotation(
            text=f"Need ≥3 users for {trait.title()}, got {len(names)}",
            row=r, col=c, showarrow=False
        )
        continue

    X2 = umap_2d(X, metric="manhattan", n_neighbors=10, min_dist=0.15)

    reg_idx = np.where(~specials)[0]
    spc_idx = np.where(specials)[0]

    if len(reg_idx):
        fig.add_trace(
            go.Scatter(
                x=X2[reg_idx, 0], y=X2[reg_idx, 1],
                mode="markers",
                marker=dict(size=6, opacity=0.75, color="royalblue"),
                name=f"{trait}-users",
                hovertext=[names[i] for i in reg_idx],
                hovertemplate="<b>%{hovertext}</b><extra></extra>"
            ),
            row=r, col=c
        )

    if len(spc_idx):
        fig.add_trace(
            go.Scatter(
                x=X2[spc_idx, 0], y=X2[spc_idx, 1],
                mode="markers",
                marker=dict(size=9, symbol="diamond", color="crimson", line=dict(width=2)),
                name=f"{trait}-special",
                hovertext=[names[i] for i in spc_idx],
                hovertemplate="<b>%{hovertext}</b><extra></extra>"
            ),
            row=r, col=c
        )

    fig.update_xaxes(title_text="UMAP-1", row=r, col=c)
    fig.update_yaxes(title_text="UMAP-2", row=r, col=c)

fig.update_layout(
    title="UMAP – Personality Trait Embeddings (2D) by Trait",
    width=1200, height=800,
    showlegend=False,
    hovermode="closest",
    margin=dict(l=40, r=20, t=60, b=40)
)

fig.show()