In [16]:
# Import library yang dibutuhkan
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

# 1. LOAD DATASET
# The path lives in one named constant so it is easy to spot and change.
# NOTE(review): the file name contains a space before "_comp" — confirm it
# matches the file on disk.
DATASET_PATH = 'dataset_buys _comp.csv'
df = pd.read_csv(DATASET_PATH)


In [17]:
# 2. DATA EXPLORATION
# Prints structure, summary statistics, missing-value counts and the target
# class balance, then saves a count plot of the target to disk.
print("Informasi Dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("\nStatistik Deskriptif:")
print(df.describe())

# Count missing values per column
print("\nJumlah nilai yang hilang:")
print(df.isnull().sum())

# Class balance of the target, as proportions (normalize=True)
print("\nDistribusi kelas target:")
print(df['Buys_Computer'].value_counts(normalize=True))

# Count plot of the target, saved to a PNG and closed (not shown inline).
# NOTE(review): the title, x-label and file name refer to credit eligibility
# ("Kelayakan Kredit") while the plotted column is Buys_Computer — confirm
# the intended wording before changing these strings.
plt.figure(figsize=(8, 6))
sns.countplot(x='Buys_Computer', data=df)
plt.title('Distribusi Kelayakan Kredit')
plt.xlabel('Kelayakan')
plt.ylabel('Jumlah')
plt.savefig('distribusi_kelayakan.png')
plt.close()

Informasi Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            1000 non-null   object
 1   Income         1000 non-null   object
 2   Student        1000 non-null   object
 3   Credit_Rating  1000 non-null   object
 4   Buys_Computer  1000 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 39.2+ KB
None

Statistik Deskriptif:
       Buys_Computer
count    1000.000000
mean        0.669000
std         0.470809
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000

Jumlah nilai yang hilang:
Age              0
Income           0
Student          0
Credit_Rating    0
Buys_Computer    0
dtype: int64

Distribusi kelas target:
Buys_Computer
1    0.669
0    0.331
Name: proportion, dtype: float64


In [18]:
# 3. DATA PREPROCESSING
# The last column is taken as the target; every column before it is a feature.
target_position = df.shape[1] - 1
X = df.iloc[:, :target_position]
y = df.iloc[:, target_position]

# Group the feature columns by dtype so each group can get its own transformer.
CATEGORICAL_DTYPES = ['object', 'category']
NUMERICAL_DTYPES = ['int64', 'float64']
categorical_cols = X.select_dtypes(include=CATEGORICAL_DTYPES).columns
numerical_cols = X.select_dtypes(include=NUMERICAL_DTYPES).columns

categorical_names = categorical_cols.tolist()
numerical_names = numerical_cols.tolist()
print(f"\nFitur Kategorikal: {categorical_names}")
print(f"Fitur Numerikal: {numerical_names}")


Fitur Kategorikal: ['Age', 'Income', 'Student', 'Credit_Rating']
Fitur Numerikal: []


In [19]:
# 4. FEATURE ENGINEERING & ENCODING
# Each step is a (name, transformer, columns) triple for the ColumnTransformer.
# Numeric columns are standardised; categorical columns are one-hot encoded.
numeric_step = ('num', StandardScaler(), numerical_cols)
# handle_unknown='ignore' is set on the encoder so that categories absent
# during fit do not raise at transform time.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [20]:
# 5. TRAIN-TEST SPLIT
# 20% of the rows are held out for testing; the split is stratified on y and
# seeded so that it is repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
print(f"\nJumlah data training: {n_train_rows}")
print(f"Jumlah data testing: {n_test_rows}")


Jumlah data training: 800
Jumlah data testing: 200


In [21]:
# 6. BUILD THE DECISION TREE MODEL
# Chain preprocessing and the classifier so both are fitted together and the
# same encoding is applied at predict time.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
]
dt_pipeline = Pipeline(steps=pipeline_steps)

In [22]:
# 7. TRAIN THE MODEL
dt_pipeline.fit(X_train, y_train)

# 8. BASELINE EVALUATION
# Predict on the held-out rows and score them; precision / recall / F1 use
# the 'weighted' average across classes.
y_pred = dt_pipeline.predict(X_test)

baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_precision = precision_score(y_test, y_pred, average='weighted')
baseline_recall = recall_score(y_test, y_pred, average='weighted')
baseline_f1 = f1_score(y_test, y_pred, average='weighted')

print("\nEvaluasi Model Awal:")
print(f"Accuracy: {baseline_accuracy:.4f}")
print(f"Precision: {baseline_precision:.4f}")
print(f"Recall: {baseline_recall:.4f}")
print(f"F1 Score: {baseline_f1:.4f}")

# Confusion matrix heatmap, saved to disk and closed (not shown inline)
cm = confusion_matrix(y_test, y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_ylabel('Actual')
cm_ax.set_xlabel('Predicted')
cm_fig.savefig('confusion_matrix.png')
plt.close(cm_fig)


Evaluasi Model Awal:
Accuracy: 0.8100
Precision: 0.8322
Recall: 0.8100
F1 Score: 0.8146


In [23]:
# 9. HYPERPARAMETER TUNING
# Exhaustive grid over the tree's depth, split/leaf size limits and split
# criterion. The "classifier__" prefix routes each parameter to the pipeline
# step named 'classifier'.
param_grid = dict(
    classifier__max_depth=[None, 5, 10, 15, 20],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
    classifier__criterion=['gini', 'entropy'],
)

# 5-fold CV on the training rows only, ranked by weighted F1, using all cores.
grid_search = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print("\nParameter terbaik:")
print(grid_search.best_params_)



Parameter terbaik:
{'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}


In [24]:
# 10. FINAL MODEL WITH THE BEST PARAMETERS
# best_estimator_ is the pipeline selected by the grid search.
best_dt_pipeline = grid_search.best_estimator_
y_pred_best = best_dt_pipeline.predict(X_test)

# Score the tuned model on the same held-out rows as the baseline
tuned_accuracy = accuracy_score(y_test, y_pred_best)
tuned_precision = precision_score(y_test, y_pred_best, average='weighted')
tuned_recall = recall_score(y_test, y_pred_best, average='weighted')
tuned_f1 = f1_score(y_test, y_pred_best, average='weighted')

print("\nEvaluasi Model Final (Setelah Hyperparameter Tuning):")
print(f"Accuracy: {tuned_accuracy:.4f}")
print(f"Precision: {tuned_precision:.4f}")
print(f"Recall: {tuned_recall:.4f}")
print(f"F1 Score: {tuned_f1:.4f}")

# Per-class precision / recall / F1 table
final_report = classification_report(y_test, y_pred_best)
print("\nClassification Report:")
print(final_report)



Evaluasi Model Final (Setelah Hyperparameter Tuning):
Accuracy: 0.8100
Precision: 0.8322
Recall: 0.8100
F1 Score: 0.8146

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.85      0.75        66
           1       0.91      0.79      0.85       134

    accuracy                           0.81       200
   macro avg       0.79      0.82      0.80       200
weighted avg       0.83      0.81      0.81       200



In [25]:

# 11. DECISION TREE VISUALISATION (only the top levels are drawn)
# Take both fitted steps from the tuned pipeline, so the feature names come
# from the same preprocessor instance that fed the tree being plotted rather
# than from the separate module-level `preprocessor` object.
dt_classifier = best_dt_pipeline.named_steps['classifier']
fitted_preprocessor = best_dt_pipeline.named_steps['preprocessor']

# Rebuild the post-encoding feature names in the transformer order of the
# ColumnTransformer. Only the two known steps contribute names; any other
# entry in transformers_ (e.g. a 'remainder' tuple) is skipped instead of
# being treated as numeric.
feature_names = []
for name, _, cols in fitted_preprocessor.transformers_:
    if name == 'cat':
        encoder = fitted_preprocessor.named_transformers_['cat']
        cats = encoder.get_feature_names_out(categorical_cols)
        feature_names.extend(cats)
    elif name == 'num':
        feature_names.extend(numerical_cols)

# Fail loudly if the labels do not line up with what the tree was trained on;
# otherwise the plot would be mislabelled.
assert len(feature_names) == dt_classifier.n_features_in_, (
    f"expected {dt_classifier.n_features_in_} feature names, got {len(feature_names)}"
)

# Draw the tree down to depth 3 and save it to disk (not shown inline)
plt.figure(figsize=(20, 10))
tree.plot_tree(dt_classifier, feature_names=feature_names, filled=True, max_depth=3)
plt.title('Decision Tree (depth=3)')
plt.savefig('decision_tree.png', dpi=300, bbox_inches='tight')
plt.close()


In [26]:
# 12. FEATURE IMPORTANCE
# Rank the encoded features by the fitted tree's importance scores (highest
# first), save a bar chart, and print the ranking.
if hasattr(dt_classifier, 'feature_importances_'):
    importances = dt_classifier.feature_importances_
    indices = np.argsort(importances)[::-1]  # descending order of importance

    ranked_names = [feature_names[idx] for idx in indices]
    ranked_scores = importances[indices]
    bar_positions = range(len(importances))

    plt.figure(figsize=(12, 8))
    plt.title('Feature Importances')
    plt.bar(bar_positions, ranked_scores, align='center')
    plt.xticks(bar_positions, ranked_names, rotation=90)
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

    print("\nFeature Importances:")
    for feature_label, score in zip(ranked_names, ranked_scores):
        print(f"{feature_label}: {score:.4f}")


Feature Importances:
Age_Tua: 0.3081
Credit_Rating_Baik: 0.1449
Student_Tidak: 0.1364
Income_Tinggi: 0.1272
Age_Muda: 0.1006
Age_Paruh Baya: 0.0696
Credit_Rating_Buruk: 0.0620
Student_Ya: 0.0266
Income_Rendah: 0.0195
Income_Sedang: 0.0052


In [27]:
# 13. CROSS-VALIDATION
# 5-fold weighted-F1 for the tuned pipeline, computed on the full X / y.
# NOTE(review): X / y include the rows held out as X_test, and the
# hyperparameters were tuned on X_train — confirm that scoring on the full
# data set is intended.
cv_scores = cross_val_score(
    estimator=best_dt_pipeline,
    X=X,
    y=y,
    scoring='f1_weighted',
    cv=5,
)
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

print("\nCross-Validation Scores:")
print(f"F1 scores: {cv_scores}")
print(f"Mean F1 score: {cv_mean:.4f}")
print(f"Standard deviation: {cv_std:.4f}")


Cross-Validation Scores:
F1 scores: [0.79731838 0.86285406 0.78156863 0.80035405 0.76704206]
Mean F1 score: 0.8018
Standard deviation: 0.0328


In [28]:
# 14. ROC CURVE (binary targets only)
# Best-effort step: if the curve cannot be produced the notebook keeps going,
# but the reason is reported instead of being silently discarded.
try:
    # Column 1 of predict_proba is used as the positive-class score.
    y_proba = best_dt_pipeline.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.4f}')
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.savefig('roc_curve.png')
    plt.close()

    print(f"\nAUC-ROC: {roc_auc:.4f}")
except Exception as exc:
    # `Exception` rather than a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate; the underlying error is printed so a real
    # bug is not mistaken for "not a binary problem".
    print("\nKurva ROC tidak dapat dibuat (mungkin bukan masalah biner atau format target tidak sesuai)")
    print(f"Detail: {type(exc).__name__}: {exc}")



AUC-ROC: 0.9292


In [29]:
# 15. SAVE MODEL
# joblib is imported with the other imports at the top of the notebook.
# The whole fitted pipeline (preprocessing + tree) is persisted, so the saved
# object can take raw feature rows directly.
MODEL_PATH = 'decision_tree_credit_model.pkl'
joblib.dump(best_dt_pipeline, MODEL_PATH)
print(f"\nModel berhasil disimpan sebagai '{MODEL_PATH}'")

# 16. EXAMPLE PREDICTIONS
# Run the first five held-out rows through the tuned pipeline.
sample = X_test.iloc[:5]
predictions = best_dt_pipeline.predict(sample)
print("\nContoh Prediksi:")
for sample_number, pred in enumerate(predictions, start=1):
    print(f"Sampel {sample_number}: Prediksi = {pred}")

print("\nProses pemodelan selesai!")


Model berhasil disimpan sebagai 'decision_tree_credit_model.pkl'

Contoh Prediksi:
Sampel 1: Prediksi = 1
Sampel 2: Prediksi = 1
Sampel 3: Prediksi = 1
Sampel 4: Prediksi = 1
Sampel 5: Prediksi = 0

Proses pemodelan selesai!
