In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

class TTMClassifier:
    """TTM stage classifier: SentenceTransformer embeddings + Logistic Regression.

    Ordinal stages:
        0 = Precontemplation
        1 = Contemplation
        2 = Preparation
        3 = Action
        4 = Maintenance

    Stage names passed to ``train`` that are not in this list are appended
    with the next free ordinal value.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Load the sentence-embedding model and set up an untrained classifier.

        Args:
            model_name: Name handed to ``SentenceTransformer``.
        """
        print(f"Loading sentence transformer model: {model_name}")
        self.embedder = SentenceTransformer(model_name)

        # Use logistic regression for better probability alignment.
        # `multi_class` is deliberately not passed: the argument is deprecated
        # in recent scikit-learn releases, and with the lbfgs solver a
        # multinomial model is already what is fitted for 3+ classes.
        self.classifier = LogisticRegression(
            solver='lbfgs',
            class_weight='balanced',
            max_iter=1000,
            random_state=42,
        )

        # Ordinal mapping for the TTM stages.
        self.stage_to_ordinal = {
            "precontemplation": 0,
            "contemplation": 1,
            "preparation": 2,
            "action": 3,
            "maintenance": 4,
        }
        self.ordinal_to_stage = {v: k for k, v in self.stage_to_ordinal.items()}

        # Populated by train(): stage name <-> class index used by the classifier.
        self.label_encoder = None
        self.inverse_encoder = None
        self.is_trained = False

    def train(self, train_data, verbose: bool = True):
        """Fit the classifier on labelled example texts.

        Args:
            train_data: Mapping of stage name -> iterable of example texts.
                A single string is treated as one example.
            verbose: Print progress messages and show the embedding progress bar.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``train_data`` holds no examples or examples for
                fewer than two stages.
        """
        if verbose:
            print("Starting training...")

        texts = []
        labels = []

        for stage_name, stage_texts in train_data.items():
            # A bare string would otherwise be extended character by character.
            if isinstance(stage_texts, str):
                stage_texts = [stage_texts]
            else:
                stage_texts = list(stage_texts)

            # Register stages outside the predefined set with the next ordinal.
            if stage_name not in self.stage_to_ordinal:
                next_ordinal = max(self.stage_to_ordinal.values(), default=-1) + 1
                self.stage_to_ordinal[stage_name] = next_ordinal
                self.ordinal_to_stage[next_ordinal] = stage_name

            texts.extend(stage_texts)
            labels.extend([stage_name] * len(stage_texts))

            if verbose:
                print(f"Added {len(stage_texts)} examples for '{stage_name}'")

        # Fail early with a clear message, before any embedding work is done.
        if not texts:
            raise ValueError("train_data contains no training examples")

        unique_labels = sorted(set(labels))
        if len(unique_labels) < 2:
            raise ValueError(
                "train_data must contain examples for at least two stages"
            )

        # Class indices follow the alphabetical order of the stage names that
        # actually have examples; they are NOT the ordinal stage values.
        label_encoder = {label: idx for idx, label in enumerate(unique_labels)}
        inverse_encoder = {idx: label for label, idx in label_encoder.items()}
        y = np.array([label_encoder[label] for label in labels])

        if verbose:
            print(f"Total training examples: {len(texts)}")
            print("Generating embeddings...")

        embeddings = self.embedder.encode(texts, show_progress_bar=verbose)

        if verbose:
            print(f"Embedding shape: {embeddings.shape}")
            print("Training classifier...")

        self.classifier.fit(embeddings, y)

        # Only publish the encoders once fitting succeeded, so the encoders
        # and the fitted classifier always describe the same label set.
        self.label_encoder = label_encoder
        self.inverse_encoder = inverse_encoder
        self.is_trained = True

        if verbose:
            print("Training completed!")

        return self

    def predict(self, texts, return_ordinal: bool = False):
        """Predict the TTM stage for one text or a collection of texts.

        Args:
            texts: A single string, or any iterable of strings.
            return_ordinal: Also include ``"predicted_ordinal"`` in each
                result (-1 if the predicted stage has no ordinal mapping).

        Returns:
            For a single string, one dict with keys ``"text"``,
            ``"predicted_label"``, ``"confidence"`` and ``"probabilities"``.
            For an iterable, a list of such dicts in input order (an empty
            list for empty input).

        Raises:
            ValueError: If called before ``train``.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions!")

        single_input = isinstance(texts, str)
        # Materialise into a list so results are paired with texts by position
        # for any iterable (e.g. a pandas Series with a non-default index).
        texts = [texts] if single_input else list(texts)
        if not texts:
            return []

        embeddings = self.embedder.encode(texts, show_progress_bar=False)

        label_indices = self.classifier.predict(embeddings)
        probabilities = self.classifier.predict_proba(embeddings)
        # predict_proba columns are ordered like classifier.classes_.
        class_stages = [
            self.inverse_encoder[int(class_idx)]
            for class_idx in self.classifier.classes_
        ]

        results = []
        for text, label_idx, probs in zip(texts, label_indices, probabilities):
            predicted_stage = self.inverse_encoder[int(label_idx)]
            prob_dict = {
                stage_name: float(prob)
                for stage_name, prob in zip(class_stages, probs)
            }

            result = {
                "text": text,
                "predicted_label": predicted_stage,
                "confidence": prob_dict[predicted_stage],
                "probabilities": prob_dict,
            }

            if return_ordinal:
                result["predicted_ordinal"] = self.stage_to_ordinal.get(
                    predicted_stage, -1
                )

            results.append(result)

        return results[0] if single_input else results

# TTM Stages Training Dataset
#
# Maps each stage name to a list of example texts for that stage. The keys
# are the same five names used in TTMClassifier.stage_to_ordinal, and the
# whole dict is passed to TTMClassifier.train() from main().
# Example counts per stage: precontemplation 11, contemplation 13,
# preparation 13, action 13, maintenance 7 (57 in total).
train_dataset = {
    # Ordinal 0
    "precontemplation": [
        "I normally just deal with it or forget about it, but speaking to my close friends, they suggested i get some help to deal with it, as it is definitely affecting my relationships. ",
        "I am hoping to be offered counselling and to have some understanding from my course directors about why I may miss labs etc",
        "It started with a simple mental health assessment, where I wanted to know if I was experiencing a moment of negativity or a sign of depression.",
        "I have had tougher times in the past but was too stubborn to seek help. ",
        "Someone to tell me that it's all in my head and that I'm fine really or just someone to give me advice on how to deal with what I'm feeling I honestly don't know really I don't know if what's going on is normal or not",
        "because it feels like everything should be perfect in my life but it somehow isn't. ",
        "Been putting it off for many years as I thought I could just deal with it and was worried about bringing up issues from the past that I had forgotten.",
        "my mother believes it will help me to get counselling, and see if I have any mental problems.",
        "the therapist I was seeing on my year aboard told me I need to be accessing this service",
        "I have been recommended to have counselling to help",
        "they said I'm going through a slump. "
    ],
    # Ordinal 1
    "contemplation": [
        "I hope to be able to at least partially process what has happened and heal from it; see if I could benefit from talking to a sex therapist maybe",
        "It would really help to talk to someone about my uni and personal struggles",
        "I hope to understand how I can work to reduce my issues and this will hopefully help me with being more able to engage with my work.",
        "Some help to navigate everything going on at the moment and help managing my stress and help with my low mood and motivation",
        "I would like to find the source of my mental health problems, and learn of ways to help deal with them on a day to day basis.",
        "I need help and I do not know what to do.",
        "I don't know if I 100% needed therapy but I just wanted to see if it would help me feel better.",
        "because self-help resources haven't been enough and i feel like i need professional support to address certain issues.",
        "Some support or guidance on how to better deal with some of the emotions and thoughts that I am consistently having.",
        "I am looking for ways to get along with my family",
        "I am really keen to engage in some talking therapy to see what could help me. ",
        "I want to learn how to handle these situations mentioned above, and practise strategies into how I should approach these situations. Also, understand what triggers me.",
        "I'm hoping to just feel better and hopefully get my confidence back, I also just want a good nights sleep without lying for hours"
    ],
   # Ordinal 2
   "preparation": [
        "I'm preparing to start cognitive therapy next month",
        "I intend to begin addressing my anxiety after finals",
        "Getting ready to commit to weekly counseling sessions",
        "I plan to initiate therapy once I find the right provider",
        "Preparing myself to engage in trauma-focused treatment",
        "I'm arranging my schedule to accommodate regular therapy",
        "I'm preparing to start therapy next month",
        "I intend to address these issues soon",
        "Getting ready to make changes in the coming weeks",
        "I'm gathering information about support groups before I join one.",
        "I've started researching therapists so I can begin sessions soon.",
        "I'm planning out steps to reduce my stress after my current projects wrap up.",
        "I'm setting goals and making a list of changes I want to work on in the next month."
],
    # Ordinal 3
    # NOTE(review): the second and third entries below are identical, so that
    # example is counted twice during training - confirm whether the
    # duplicate is intentional.
    "action": [
        "Speaking to my friends, family and academic tutor did help alleviate these feelings quite a bit",
        "Now, i chose to seek help because I think i should talk to someone about how i am feeling.",
        "Now, i chose to seek help because I think i should talk to someone about how i am feeling.",
        "i had a brief period of extreme anxiety that was affecting my sleep, and because of this i had sleeping pills prescribed to me.",
        "I have been trying for the last few months to implement things previous counsellors/supporters have suggested, but I don't think I can help myself any further without more professional input.",
        "I am working through this book right now that is hopefully going to help me change my attachment patterns in the long run",
        "I am currently getting referred for a formal ADHD diagnosis and support",
        "Integration of healthy coping stategies in my life",
        "decreasing the frequency at which I engage in less healthy ones (e.g. binge eating).",
        "I had some over the phone CBT sessions with the use of the app",
        "I have been prescribe medication",
        "I have written them down in a notes app so it's easier for me to get across details and not forget things",
        "I've been talking to the wellbeing service"
    ],
    # Ordinal 4
    "maintenance": [
        "I have gotten myself out of tough times before and have a good support network, but I want the university to also be in the loop.",
        "I have experienced something similar a few years ago and I do not want to wait until it gets to that level of severity again.",
        "I've been struggling with anxiety for a few years but was able to cope relatively well.",
        "I have had help before primarily focused on anxiety and several techniques have helped me learn to control this",
        "I have been working on some habits and I have improved",
        "I found counselling so helpful but without it I feel so helpless and lost.",
        "improved outlook on life"
    ]
}

# Default locations of the referral spreadsheets and of the predictions CSV.
DEFAULT_INPUT_PATHS = (
    '/Users/fiona/Desktop/R_RP/df_for_robin.xlsx',
    '/Users/fiona/Desktop/R_RP/df_for_robin2_v2.xlsx',
)
DEFAULT_OUTPUT_PATH = '/Users/fiona/Desktop/R_RP/nlp_predictions.csv'

# Updated exclusion list (replace 0159200 with 0142901), followed by the
# potentially contaminated case 1128914.
DEFAULT_EXCLUDE_IDS = (
    '0142340', '0146512', '0192142', '0190740', '0117963',
    '0199154', '0146861', '0185970', '0140911', '0142901',
    '1128914',
)

# Combined responses of this many characters or fewer are discarded.
MIN_TEXT_LENGTH = 30


def _normalise_study_ids(ids, width):
    """Return a Series of study IDs as stripped strings zero-padded to *width*."""
    as_text = ids.astype(str).str.strip()
    # A numeric column containing NaN is read as float, giving e.g. "142340.0".
    as_text = as_text.str.replace(r'\.0$', '', regex=True)
    return as_text.str.zfill(width)


def main(input_paths=DEFAULT_INPUT_PATHS,
         output_path=DEFAULT_OUTPUT_PATH,
         exclude_ids=DEFAULT_EXCLUDE_IDS):
    """Train the TTM classifier and score every eligible referral.

    Reads the referral spreadsheets, drops excluded study IDs and very short
    responses, trains ``TTMClassifier`` on ``train_dataset``, predicts a stage
    for each remaining referral and writes the predictions to a CSV file.

    Args:
        input_paths: Excel files to read and concatenate. Each must provide
            the columns ``studyid_new``, ``q1461`` and ``q1477``.
        output_path: Destination of the predictions CSV.
        exclude_ids: Study IDs to leave out of the analysis.

    Returns:
        DataFrame with columns ``studyid_new``, ``NLP_readiness`` (ordinal
        stage) and ``confidence``, one row per successfully scored referral.
    """
    print("Loading and preparing data...")

    # Load and combine the Excel files.
    frames = [pd.read_excel(path) for path in input_paths]
    all_referrals = pd.concat(frames, ignore_index=True)

    # Compare IDs in a normalised form: if Excel stored the IDs as numbers the
    # leading zeros are gone ("0142340" -> 142340) and a plain string
    # comparison would silently exclude nothing.
    width = max((len(str(study_id)) for study_id in exclude_ids), default=0)
    excluded_keys = {str(study_id).strip().zfill(width) for study_id in exclude_ids}
    study_keys = _normalise_study_ids(all_referrals['studyid_new'], width)

    # .copy() so the column added below is written to an independent frame,
    # not to a slice of the concatenated one.
    all_referrals = all_referrals[~study_keys.isin(excluded_keys)].copy()

    # Create combined text field from the two free-text answers.
    first_answer = all_referrals['q1461'].fillna('').astype(str)
    second_answer = all_referrals['q1477'].fillna('').astype(str)
    all_referrals['combined_text'] = first_answer + ' ' + second_answer

    # Clean text - remove very short responses.
    long_enough = all_referrals['combined_text'].str.len() > MIN_TEXT_LENGTH
    all_referrals = all_referrals[long_enough]

    print(f"Processing {len(all_referrals)} referrals...")

    # Initialize and train classifier.
    classifier = TTMClassifier()
    classifier.train(train_dataset)

    # Generate predictions one referral at a time so that a single failing
    # row is reported and skipped instead of aborting the whole run.
    print("Generating NLP predictions...")
    predictions = []
    failed = 0

    for study_id, text in zip(all_referrals['studyid_new'],
                              all_referrals['combined_text']):
        try:
            result = classifier.predict(text, return_ordinal=True)
        except Exception as e:
            failed += 1
            print(f"Error processing {study_id}: {e}")
            continue
        predictions.append({
            'studyid_new': study_id,
            'NLP_readiness': result['predicted_ordinal'],
            'confidence': result['confidence'],
        })

    if failed:
        print(f"Skipped {failed} referrals that could not be processed")

    # Explicit columns keep the frame well-formed when nothing was scored.
    nlp_predictions = pd.DataFrame(
        predictions,
        columns=['studyid_new', 'NLP_readiness', 'confidence'],
    )

    # Save to CSV.
    nlp_predictions.to_csv(output_path, index=False)

    print(f"Predictions saved for {len(nlp_predictions)} referrals")
    print("Distribution of NLP readiness scores:")
    print(nlp_predictions['NLP_readiness'].value_counts().sort_index())

    return nlp_predictions

# Run the pipeline only when this module is executed directly (not when it is
# imported); the returned DataFrame is kept in the module-level name
# `nlp_predictions`.
if __name__ == "__main__":
    nlp_predictions = main()

Loading and preparing data...
Processing 99 referrals...
Loading sentence transformer model: all-MiniLM-L6-v2
Starting training...
Added 11 examples for 'precontemplation'
Added 13 examples for 'contemplation'
Added 13 examples for 'preparation'
Added 13 examples for 'action'
Added 7 examples for 'maintenance'
Total training examples: 57
Generating embeddings...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding shape: (57, 384)
Training classifier...
Training completed!
Generating NLP predictions...
Predictions saved for 99 referrals
Distribution of NLP readiness scores:
NLP_readiness
0    10
1    53
3     8
4    28
Name: count, dtype: int64
