In [2]:
# Import Library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib
import json

In [3]:
# Load Dataset
data = pd.read_csv('dataset/Speed_Dating_Data.csv', encoding='latin1')
print("Shape:", data.shape)
data.head()

Shape: (8378, 195)


Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,


In [4]:
# Data Cleaning
# Hapus kolom yang tidak diperlukan (misalnya id, tanggal, dll)
data = data.drop(columns=['iid', 'id', 'pid'], errors='ignore')

# Hapus baris dengan missing values berlebihan
data = data.dropna(thresh=len(data.columns)*0.8)

# Ganti NaN dengan rata-rata untuk kolom numerik
for col in data.select_dtypes(include=[np.number]).columns:
    data[col] = data[col].fillna(data[col].mean())

# Ganti NaN kategori dengan mode
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].fillna(data[col].mode()[0])

In [5]:
# Feature Engineering
# Pilih fitur penting (sesuai proposal)
features = ['gender', 'age', 'attr_o', 'sinc_o', 'fun_o', 'intel_o', 'shar_o', 'amb_o', 'like_o', 'prob_o']
target = 'match'  # variabel target (1 = cocok, 0 = tidak cocok)
data = data[features + [target]].dropna()

# Encoding kategori
le = LabelEncoder()
data['gender'] = le.fit_transform(data['gender'])

# Pisahkan X dan y
X = data.drop(columns=[target])
y = data[target]

# Normalisasi numerik
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Split Data (Training dan Testing)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [7]:
# Model Training (Random Forest dan SVM)
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# SVM
svm_model = SVC(kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

In [8]:
# Evaluation
def evaluate_model(y_true, y_pred, name):
    return {
        "model": name,
        "accuracy": round(accuracy_score(y_true, y_pred), 3),
        "precision": round(precision_score(y_true, y_pred), 3),
        "recall": round(recall_score(y_true, y_pred), 3),
        "f1_score": round(f1_score(y_true, y_pred), 3),
    }

rf_report = evaluate_model(y_test, rf_pred, "RandomForestClassifier")
svm_report = evaluate_model(y_test, svm_pred, "SVC")

# Gabungkan hasil
reports = {
    "RandomForest": rf_report,
    "SVM": svm_report
}

print(json.dumps(reports, indent=4))

{
    "RandomForest": {
        "model": "RandomForestClassifier",
        "accuracy": 0.822,
        "precision": 0.483,
        "recall": 0.152,
        "f1_score": 0.231
    },
    "SVM": {
        "model": "SVC",
        "accuracy": 0.82,
        "precision": 0.25,
        "recall": 0.011,
        "f1_score": 0.021
    }
}


In [9]:
# Simpan Model Terbaik
# Pilih model dengan akurasi tertinggi
best_model = rf_model if rf_report["accuracy"] >= svm_report["accuracy"] else svm_model
best_name = "RandomForest" if rf_model == best_model else "SVM"

# Simpan model dan scaler
joblib.dump(best_model, 'trained_model.joblib')
joblib.dump(scaler, 'scaler.joblib')

# Simpan laporan evaluasi
evaluation_report = {
    "best_model": best_name,
    "reports": reports
}

with open('evaluation_report.json', 'w') as f:
    json.dump(evaluation_report, f, indent=4)

print(f"✅ Model terbaik: {best_name} disimpan sebagai trained_model.joblib")

✅ Model terbaik: RandomForest disimpan sebagai trained_model.joblib
