In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer
import json
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean_text(text):
    """
    Normalize a raw transcript string before it is embedded.

    The text is lowercased, then every match of the removal patterns below
    is deleted (in the listed order), then each run of whitespace is
    replaced by a single space and the ends are trimmed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    # Patterns whose matches are deleted outright; order matters because
    # each substitution runs on the output of the previous one.
    removal_patterns = (
        r'\[.*?\]',          # bracketed annotations such as [laughter], [sighs]
        r'http\S+|www\S+',   # URLs
        r'<.*?>',            # HTML tags
    )

    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # Newlines, tabs and repeated spaces all become one space.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()

In [3]:
# --- 1. Load your Embedding Model ---
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# --- 2. Process the .jsonl File ---
# Single source of truth for the input path (also used in the error message).
TRAIN_PATH = '/Volumes/MACBACKUP/data/json/lines/train.jsonl'
# A conversation whose summed label scores reach this value goes to Class 1.
PHQ8_THRESHOLD = 10

texts_class_0 = []
texts_class_1 = []
# Records dropped because 'labels' or 'turns' were missing or malformed.
skipped_records = 0

print("Reading and processing train.jsonl...")

try:
    # Explicit encoding so decoding does not depend on the platform default.
    with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing newline) are not records.
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                # Re-raise with the line number so the bad record can be found.
                raise ValueError(
                    f"invalid JSON on line {line_number}: {e}"
                ) from e

            # A usable record is a JSON object with a non-empty list
            # of individual scores under 'labels'.
            individual_scores = (
                data.get('labels') if isinstance(data, dict) else None
            )
            if not isinstance(individual_scores, list) or not individual_scores:
                skipped_records += 1
                continue

            try:
                # Summing gives the final score and doubles as validation:
                # a non-numeric item raises TypeError.
                score_value = sum(individual_scores)
            except TypeError:
                # This catches cases like [0, 1, "N/A"]
                skipped_records += 1
                continue

            # 'turns' must be a list of strings to be joined into one text;
            # a missing or malformed value skips the record instead of
            # aborting the whole run.
            turns = data.get('turns')
            if not isinstance(turns, list) or not all(
                isinstance(turn, str) for turn in turns
            ):
                skipped_records += 1
                continue

            full_conversation_text = clean_text(" ".join(turns))

            # Apply the PHQ8 rule: threshold or above is Class 1.
            if score_value >= PHQ8_THRESHOLD:
                texts_class_1.append(full_conversation_text)
            else:
                texts_class_0.append(full_conversation_text)

except FileNotFoundError:
    print(f"Error: File not found at '{TRAIN_PATH}'")
    # Re-raise rather than exit(): exit() would shut down the notebook
    # kernel (or end a script with a success status) and hide the failure.
    raise
except Exception as e:
    print(f"An error occurred while reading the file: {e}")
    raise

# --- 3. Check if we found any data ---
print("Processing complete.")
print(f"Found {len(texts_class_0)} documents for Class 0 (Score < {PHQ8_THRESHOLD}).")
print(f"Found {len(texts_class_1)} documents for Class 1 (Score >= {PHQ8_THRESHOLD}).")
if skipped_records:
    print(f"Skipped {skipped_records} records with missing or invalid 'labels'/'turns'.")

Loading embedding model...
Reading and processing train.jsonl...
Processing complete.
Found 76 documents for Class 0 (Score < 10).
Found 30 documents for Class 1 (Score >= 10).


In [4]:
# Both prototypes are required, so stop here if either class is empty.
if len(texts_class_0) == 0 or len(texts_class_1) == 0:
    print("\n--- !! WARNING !! ---")
    print("One or both classes have 0 documents.")
    print("Please check your 'train.jsonl' file and the PHQ8 scores.")
    print("------------------------")
    # Raise instead of exit(): exit() would shut down the notebook kernel
    # (or end a script with a success status) rather than report a failure.
    raise ValueError(
        "Cannot build prototypes: one or both classes have 0 documents."
    )

# --- 4. Generate Embeddings for all found texts ---
print("Generating embeddings for Class 0...")
embeddings_class_0 = model.encode(texts_class_0, show_progress_bar=True)

print("Generating embeddings for Class 1...")
embeddings_class_1 = model.encode(texts_class_1, show_progress_bar=True)

# --- 5. Create the Prototype (Centroid) ---
# Each prototype is the mean over axis 0 of that class's embeddings
# (presumably one row per document -- confirm against encode()'s output).
print("Averaging embeddings to create prototypes...")
emb_0 = np.mean(embeddings_class_0, axis=0)
emb_1 = np.mean(embeddings_class_1, axis=0)

# --- 6. Save Everything for Production ---
model_save_path = '/Volumes/MACBACKUP/embeddings'
print(f"Saving model to {model_save_path}...")
model.save(model_save_path)

# NOTE(review): these are relative paths, so the prototypes are written to
# the current working directory rather than next to the saved model --
# confirm that is intended.
print("Saving prototype vectors...")
np.save('prototype_emb_0.npy', emb_0)
np.save('prototype_emb_1.npy', emb_1)

print("\nSetup complete. Model and prototypes are saved.")

Generating embeddings for Class 0...


Batches: 100%|██████████| 3/3 [00:01<00:00,  2.38it/s]


Generating embeddings for Class 1...


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.65it/s]


Averaging embeddings to create prototypes...
Saving model to /Volumes/MACBACKUP/embeddings...
Saving prototype vectors...

Setup complete. Model and prototypes are saved.
