In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pickle

# Load the dataset from disk.
df = pd.read_csv('file.csv')

# Separate the target from the features. Besides the target itself, the
# columns 'nama', 'nomor' and 'nik' are excluded from the feature matrix
# because they are name/ID-like fields rather than predictive features.
TARGET_COLUMN = 'status'
NON_FEATURE_COLUMNS = ['nama', 'nomor', 'nik']
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN, *NON_FEATURE_COLUMNS])


def _scaled(estimator):
    """Return a pipeline that applies StandardScaler before `estimator`."""
    return make_pipeline(StandardScaler(), estimator)


# Candidate models. The scaler lives inside the pipeline, so it is fitted as
# part of each estimator fit rather than once on the whole dataset.
# Insertion order matters: it decides which model wins when scores tie.
classifiers = {
    "LogisticRegression": _scaled(LogisticRegression(max_iter=1000)),
    "RandomForest": RandomForestClassifier(random_state=42),
    "SVM": _scaled(SVC()),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "KNeighbors": _scaled(KNeighborsClassifier()),
}

# Score every candidate with 5-fold cross-validated accuracy and remember the
# mean score per model name.
results = {}
for name, clf in classifiers.items():
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
    mean_accuracy, accuracy_std = scores.mean(), scores.std()
    results[name] = mean_accuracy
    print(f"{name} CV Accuracy: {mean_accuracy:.4f} (+/- {accuracy_std:.4f})")

# Pick the model with the highest mean accuracy; on a tie, max() keeps the
# first entry encountered, i.e. the earliest one in `classifiers`.
best_clf_name, best_accuracy = max(results.items(), key=lambda item: item[1])
print(f"Best classifier: {best_clf_name} with accuracy {best_accuracy:.4f}")

# Refit the winner on the full dataset before persisting it:
# cross_val_score fits clones, so the estimator stored in `classifiers`
# is still unfitted at this point. fit() returns the estimator itself.
best_clf = classifiers[best_clf_name].fit(X, y)

# Persist the fitted model with pickle.
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_clf, model_file)

print("Best model saved as best_model.pkl")


LogisticRegression CV Accuracy: 1.0000 (+/- 0.0000)
RandomForest CV Accuracy: 0.9867 (+/- 0.0267)
SVM CV Accuracy: 1.0000 (+/- 0.0000)
DecisionTree CV Accuracy: 0.9800 (+/- 0.0400)
KNeighbors CV Accuracy: 0.9800 (+/- 0.0267)
Best classifier: LogisticRegression with accuracy 1.0000
Best model saved as best_model.pkl
