In [1]:
import os
import pandas as pd
import numpy as np
import nltk
import textstat
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer, util

# Load models
# Tokenizer data used by sent_tokenize / word_tokenize. Recent NLTK releases
# look up the 'punkt_tab' resource instead of the older pickled 'punkt'
# models, so both are fetched to keep tokenization working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# spaCy pipeline; used below for part-of-speech counts.
nlp = spacy.load("en_core_web_sm")
# Sentence encoder; used below to embed sentences for the coherence features.
sent_model = SentenceTransformer('all-MiniLM-L6-v2')
# VADER analyzer; used below for per-sentence compound sentiment scores.
sentiment_analyzer = SentimentIntensityAnalyzer()

# Paths
# Root folder holding one sub-folder of .txt files per category.
base_path = "data/simulated_baselines"
# Sub-folder names; each one is also stored as the 'label' of its files.
categories = ['healthy', 'impaired']

# Results
# One dict of features per processed file, appended by the extraction loop.
data = []

# Walk every .txt file under base_path/<category> and append one row of
# lexical, coherence, readability, part-of-speech and sentiment features to
# `data` per file. Files that yield no sentences or tokens still produce a
# row, with the affected statistics reported as 0.
for category in categories:
    path = os.path.join(base_path, category)
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the exported table) reproducible between runs.
    for fname in sorted(os.listdir(path)):
        if not fname.endswith('.txt'):
            continue
        # Explicit encoding so decoding does not depend on the platform's
        # default locale.
        with open(os.path.join(path, fname), 'r', encoding='utf-8') as f:
            text = f.read()

        # Tokenize (word tokens are lower-cased so the type/token ratio is
        # case-insensitive).
        sentences = sent_tokenize(text)
        words = word_tokenize(text.lower())
        word_count = len(words)
        unique_words = len(set(words))

        # Coherence - cosine sim between adjacent sentence embeddings.
        # With fewer than two sentences there are no adjacent pairs, so the
        # encoder is skipped and both statistics fall back to 0.
        if len(sentences) > 1:
            sent_embeddings = sent_model.encode(sentences)
            sims = [
                util.cos_sim(current, following).item()
                for current, following in zip(sent_embeddings, sent_embeddings[1:])
            ]
            avg_coherence = np.mean(sims)
            std_coherence = np.std(sims)
        else:
            avg_coherence = 0
            std_coherence = 0

        # Readability
        flesch = textstat.flesch_reading_ease(text)
        fog = textstat.gunning_fog(text)

        # POS tagging: share of nouns, verbs and pronouns among all tokens
        # counted by spaCy.
        doc = nlp(text)
        pos_counts = doc.count_by(spacy.attrs.POS)
        total_tokens = sum(pos_counts.values())
        if total_tokens:
            noun_ratio = pos_counts.get(spacy.symbols.NOUN, 0) / total_tokens
            verb_ratio = pos_counts.get(spacy.symbols.VERB, 0) / total_tokens
            pron_ratio = pos_counts.get(spacy.symbols.PRON, 0) / total_tokens
        else:
            # No tokens at all (e.g. an empty file): report 0 instead of
            # raising ZeroDivisionError.
            noun_ratio = 0
            verb_ratio = 0
            pron_ratio = 0

        # Sentiment: mean and spread of the per-sentence compound scores.
        sentiments = [sentiment_analyzer.polarity_scores(s)['compound'] for s in sentences]
        if sentiments:
            avg_sentiment = np.mean(sentiments)
            std_sentiment = np.std(sentiments)
        else:
            # np.mean/np.std of an empty list would give NaN plus a warning.
            avg_sentiment = 0
            std_sentiment = 0

        # Store results
        data.append({
            'filename': fname,
            'label': category,
            'word_count': word_count,
            'unique_words': unique_words,
            'type_token_ratio': unique_words / word_count if word_count else 0,
            'avg_coherence': avg_coherence,
            'std_coherence': std_coherence,
            'flesch_score': flesch,
            'fog_index': fog,
            'noun_ratio': noun_ratio,
            'verb_ratio': verb_ratio,
            'pron_ratio': pron_ratio,
            'avg_sentiment': avg_sentiment,
            'std_sentiment': std_sentiment
        })

# Turn the collected per-file feature rows into a table, write it to disk
# without the index column, and show the first rows as the cell's output.
df = pd.DataFrame(data=data)
df.to_csv(path_or_buf="data/feature_summary.csv", index=False)
df.head(n=5)

ModuleNotFoundError: No module named 'textstat'