In [29]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GroupKFold,
    LeaveOneGroupOut,
    StratifiedKFold,
    cross_val_score,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Read the annotated sentence table from disk.
data = pd.read_csv('/Users/yashsoni/Documents/data_stories_one_shot.csv')  # Update path as needed

# Binary target: Stage 1 -> 0 (Show); any other Stage value -> 1 (Tell).
# The comparison is vectorised, so no per-row Python call is needed.
data['Target'] = (data['Stage'] != 1).astype(int)

# Pull out the raw arrays used downstream: sentence text, binary label, and
# the plot each sentence belongs to (used later as the CV grouping key).
texts, labels, plot_ids = (
    data[column].values for column in ('Sentence', 'Target', 'Plot_Name')
)

# TF-IDF features: lowercase the text, drop English stop words, and keep only
# tokens made of two or more ASCII letters (digits and punctuation are ignored).
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    token_pattern=r'\b[a-z]{2,}\b',
)

# Classifiers to compare. Every estimator that consumes randomness is seeded
# so that re-running the evaluation reproduces the same scores.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    # LinearSVC's coordinate-descent solver draws from the RNG; without a seed
    # its cross-validation scores can differ between runs.
    "LinearSVM": LinearSVC(class_weight='balanced', random_state=42),
    "NaiveBayes": MultinomialNB(),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced'),
}

# Baseline evaluation: shuffled 5-fold CV over individual sentences, with the
# class ratio preserved in every fold. The splitter is seeded for repeatability.
strat_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("Results: 5-Fold Stratified Cross-Validation")
for model_name, model in models.items():
    # cross_val_score clones the pipeline for each fold, so the shared
    # vectorizer is refit on that fold's training sentences only.
    pipeline = Pipeline(steps=[('vectorizer', tfidf), ('classifier', model)])
    accuracy_scores = cross_val_score(
        estimator=pipeline,
        X=texts,
        y=labels,
        scoring='accuracy',
        cv=strat_cv,
    )
    print(f"{model_name}: {accuracy_scores.mean():.3f} ± {accuracy_scores.std():.3f}")

# Leave-One-Plot-Out Cross-Validation
# Each fold holds out every sentence of exactly one plot and trains on the
# remaining plots, so the score reflects generalisation to an unseen plot.
print("\nResults: Leave-One-Plot-Out Cross-Validation")
# LeaveOneGroupOut produces one fold per distinct group id directly, which is
# what the heading describes; no manual count of unique plots is required.
group_cv = LeaveOneGroupOut()
for model_name, model in models.items():
    pipeline = Pipeline([('vectorizer', tfidf), ('classifier', model)])
    # `groups` tells the splitter which plot each sentence belongs to.
    group_scores = cross_val_score(pipeline, texts, labels, cv=group_cv, groups=plot_ids, scoring='accuracy')
    # Mean and standard deviation are taken across the per-plot accuracies.
    print(f"{model_name}: {group_scores.mean():.3f} ± {group_scores.std():.3f}")

Results: 5-Fold Stratified Cross-Validation
LogisticRegression: 0.815 ± 0.075
LinearSVM: 0.800 ± 0.045
NaiveBayes: 0.777 ± 0.066
RandomForest: 0.792 ± 0.062

Results: Leave-One-Plot-Out Cross-Validation
LogisticRegression: 0.804 ± 0.186
LinearSVM: 0.805 ± 0.183
NaiveBayes: 0.711 ± 0.142
RandomForest: 0.623 ± 0.138
