**Tugas 1**

Terdapat dataset mushroom. Berdasarkan dataset yang tersebut, bandingkan peforma antara algoritma Decision Tree dan RandomForest. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [73]:
# Import library
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [74]:
# Load dataset
mushroom_data = pd.read_csv('data/mushrooms.csv')
mushroom_data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [75]:
# Proses Encoding
label_encoder = LabelEncoder()

# Loop melalui seluruh kolom dalam dataset dan menerapkan Label Encoding
for column in mushroom_data.columns:
    mushroom_data[column] = label_encoder.fit_transform(mushroom_data[column])
    
mushroom_data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [76]:
# Split data label dan target
X = mushroom_data.drop('class', axis=1)
y = mushroom_data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
# Menyesuaikan hyperparameter untuk Decision Tree
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
dt_grid = GridSearchCV(DecisionTreeClassifier(), dt_param_grid, cv=5)
dt_grid.fit(X_train, y_train)

best_dt = dt_grid.best_estimator_
print("Best estimator:", best_dt)

Best estimator: DecisionTreeClassifier()


In [79]:
# Menyesuaikan hyperparameter untuk RandomForest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv=5)
rf_grid.fit(X_train, y_train)

best_rf = rf_grid.best_estimator_
print("Best estimator:", best_rf)

Best estimator: RandomForestClassifier()


In [80]:
# Melatih model
best_dt.fit(X_train, y_train)
best_rf.fit(X_train, y_train)

# Membuat prediksi untuk data uji
y_pred_dt = best_dt.predict(X_test)
y_pred_rf = best_rf.predict(X_test)

# Evaluasi model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print("Decision Tree Accuracy: ", accuracy_dt)
print("Random Forest Accuracy: ", accuracy_rf)

Decision Tree Accuracy:  1.0
Random Forest Accuracy:  1.0


**Tugas 2**

Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan peforma antara algoritma Decision Tree dan AdaBoost. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [81]:
# Menyesuaikan hyperparameter untuk AdaBoost
adaboost_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0]
}
adaboost_grid = GridSearchCV(AdaBoostClassifier(estimator=best_dt), adaboost_param_grid, cv=5)
adaboost_grid.fit(X_train, y_train)

best_adaboost = adaboost_grid.best_estimator_
print("Best estimator:", best_adaboost)

Best estimator: AdaBoostClassifier(estimator=DecisionTreeClassifier(), learning_rate=0.01)


In [82]:
# Melatih model
best_dt.fit(X_train, y_train)
best_adaboost.fit(X_train, y_train)

# Membuat prediksi untuk data uji
y_pred_dt = best_dt.predict(X_test)
y_pred_adaboost = best_adaboost.predict(X_test)

# Evaluasi model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)

print("Decision Tree Accuracy:", accuracy_dt)
print("AdaBoost Accuracy:", accuracy_adaboost)

Decision Tree Accuracy: 1.0
AdaBoost Accuracy: 1.0


**Tugas 3**

Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma
1. Logistic Regression
2. SVM kernel polynomial
3. Decission Tree

In [91]:
# Load dataset
diabetes_data = pd.read_csv('data/diabetes.csv')
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [92]:
# Ganti nilai 0 dengan mean
fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

feature_columns = [col for col in diabetes_data.columns if col != "Outcome"]

diabetes_data[feature_columns] = fill_values.fit_transform(diabetes_data[feature_columns])
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31.0,0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,4.494673,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [93]:
# Split data label dan target
X = diabetes_data.drop('Outcome', axis=1)
y = diabetes_data.Outcome
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [94]:
# Karena kolom DiabetesPedigreeFunction memiliki nilai yang bervariasi, kita perlu melakukan standarisasi
from sklearn.preprocessing import StandardScaler
std = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = std.fit_transform(X_train)
X_test_std = std.transform(X_test)

In [103]:
# Menyesuaikan hyperparameter untuk Logistic RegressionCV
logreg_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'max_iter':  [500, 1000, 5000]
}
logreg = GridSearchCV(LogisticRegression(), logreg_param_grid, cv=5)
logreg.fit(X_train, y_train)

best_logreg = logreg.best_estimator_
print("Best estimator:", best_logreg)

Best estimator: LogisticRegression(C=10, max_iter=500)


In [104]:
# Menyesuaikan hyperparameter untuk SVM
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['poly'],
}
svm_grid = GridSearchCV(SVC(), svm_param_grid, cv=5)
svm_grid.fit(X_train, y_train)

best_svm = svm_grid.best_estimator_
print("Best estimator:", best_svm)

Best estimator: SVC(C=10, kernel='poly')


In [105]:
# Menyesuaikan hyperparameter untuk Decision Tree
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_leaf': [1, 2, 4]
}
dt_grid = GridSearchCV(DecisionTreeClassifier(), dt_param_grid, cv=5)
dt_grid.fit(X_train, y_train)
best_dt = dt_grid.best_estimator_
print("Best estimator:", best_dt)

Best estimator: DecisionTreeClassifier(min_samples_leaf=4)


In [106]:
# Membuat model ensemble voting
ensemble_model = VotingClassifier(estimators=[
    ('logreg', best_logreg),
    ('svm', best_svm),
    ('dt', best_dt)
], voting='hard')

# Melatih model ensemble
ensemble_model.fit(X_train, y_train)

# Melakukan prediksi untuk data uji
y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluasi model
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print("Ensemble Voting Accuracy:", accuracy_ensemble)

Ensemble Voting Accuracy: 0.7792207792207793
