In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pickle

# 📌 Load your data
df = pd.read_csv('D:/inductive_deductive/backend/auto_labeled_responses.csv')

# Drop rows with NaN in target
df = df.dropna(subset=['label'])

# Separate features and target
X = df.drop('label', axis=1)
y = df['label']

# Encode target
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

# Optional: Add noise to make features less predictable
X += np.random.normal(0, 0.05, size=X.shape)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Preprocessing pipeline
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Define models with regularization
models = {
    'RandomForest': RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42),
    'LogisticRegression': LogisticRegression(C=0.5, max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(max_depth=4),
    'SVM': SVC(C=0.5, kernel='rbf', probability=True),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'GaussianNB': GaussianNB()
}

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("🔍 Comparing models...\n")
best_model = None
best_score = 0

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    model.fit(X_train, y_train)
    test_score = accuracy_score(y_test, model.predict(X_test))

    print(f"{name} ➤ CV Accuracy: {scores.mean():.4f}, Test Accuracy: {test_score:.4f}")

    if test_score > best_score:
        best_score = test_score
        best_model = model

# Save best model
print(f"\n✅ Best model: {best_model.__class__.__name__} with accuracy: {best_score:.4f}")

with open('D:/inductive_deductive/backend/model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open('D:/inductive_deductive/backend/preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

with open('D:/inductive_deductive/backend/target_encoder.pkl', 'wb') as f:
    pickle.dump(target_encoder, f)


🔍 Comparing models...

RandomForest ➤ CV Accuracy: 0.8786, Test Accuracy: 0.8611
LogisticRegression ➤ CV Accuracy: 0.9714, Test Accuracy: 0.8889
DecisionTree ➤ CV Accuracy: 0.8357, Test Accuracy: 0.8889
SVM ➤ CV Accuracy: 0.8500, Test Accuracy: 0.8056
KNN ➤ CV Accuracy: 0.7857, Test Accuracy: 0.7778
GaussianNB ➤ CV Accuracy: 0.8643, Test Accuracy: 0.7500

✅ Best model: LogisticRegression with accuracy: 0.8889
