#### Tugas 1

Terdapat dataset mushroom. Berdasarkan dataset yang tersebut, bandingkan peforma antara algoritma **Decision Tree** dan **RandomForest**. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

***Jawab***

##### Import Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

##### Persiapan Data

In [2]:
# Load data
df = pd.read_csv('data/mushrooms.csv')

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


Cek Kolom Null

In [3]:
# Cek kolom null
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

##### Encoding

In [4]:
# Encode all categorical columns using LabelEncoder
label_encoders = {}
for column in df.columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [5]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


##### Seleksi Fitur

In [6]:
# Seleksi fitur

X = df.drop(['class'], axis=1)
y = df['class']

##### Split Data Training dan Data Testing

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

##### **Traning Decision Tree**

In [8]:
dt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dt.fit(X_train, y_train)

# Memprediksi label set test
y_pred_dt = dt.predict(X_test)

#  menghitung set accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 1.00
Test set accuracy: 1.0


In [9]:
# Hyperparameter tuning for Decision Tree
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt = DecisionTreeClassifier()
grid_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, scoring='accuracy', cv=5)
grid_dt.fit(X_train, y_train)

best_dt = grid_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Test set accuracy: {:.2f}".format(acc_dt))
print("Best Parameters for Decision Tree:", grid_dt.best_params_)

Decision Tree Test set accuracy: 1.00
Best Parameters for Decision Tree: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


##### **Training RandomForest**

In [10]:
rf = RandomForestClassifier(n_estimators=10, random_state=1)

# Sesuaikan dt ke set training
rf.fit(X_train, y_train)

# Memprediksi label set test
y_pred_rf = rf.predict(X_test)

#  menghitung set accuracy
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Test set accuracy: {:.2f}".format(acc_rf))
print(f"Test set accuracy: {acc_rf}")

Test set accuracy: 1.00
Test set accuracy: 1.0


In [11]:
# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=1)
grid_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Test set accuracy: {:.2f}".format(acc_rf))
print("Best Parameters for Random Forest:", grid_rf.best_params_)

Random Forest Test set accuracy: 1.00
Best Parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


#### Tugas 2

Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan peforma antara algoritma Decision Tree dan AdaBoost. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

***Jawab***

##### Import Library

In [28]:
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import AdaBoostClassifier # import AdaBoost
from sklearn.naive_bayes import GaussianNB

##### **Traning Decision Tree**

In [13]:
dt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dt.fit(X_train, y_train)

# Memprediksi label set test
y_pred_dt = dt.predict(X_test)

#  menghitung set accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 1.00
Test set accuracy: 1.0


In [14]:
# Hyperparameter tuning for Decision Tree
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt = DecisionTreeClassifier()
grid_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, scoring='accuracy', cv=5)
grid_dt.fit(X_train, y_train)

best_dt = grid_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Test set accuracy: {:.2f}".format(acc_dt))
print("Best Parameters for Decision Tree:", grid_dt.best_params_)

Decision Tree Test set accuracy: 1.00
Best Parameters for Decision Tree: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


##### **Traning AdaBoost**

In [15]:
ada = AdaBoostClassifier(n_estimators=2)

# Sesuaikan dt ke set training
ada.fit(X_train, y_train)

# Memprediksi label set test
y_pred_ada = ada.predict(X_test)

#  menghitung set accuracy
acc_ada = accuracy_score(y_test, y_pred_ada)
print("Test set accuracy: {:.2f}".format(acc_ada))
print(f"Test set accuracy: {acc_ada}")

Test set accuracy: 0.84
Test set accuracy: 0.8449230769230769


In [16]:
# Hyperparameter tuning for AdaBoost
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'base_estimator__max_depth': [1, 2, 3]  # This specifies the max depth of the base estimator (usually a Decision Tree)
}

# Create a base estimator (e.g., Decision Tree)
dt = DecisionTreeClassifier()

# Create the AdaBoost classifier
ada = AdaBoostClassifier(base_estimator=dt, random_state=1)

grid_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, scoring='accuracy', cv=5)
grid_ada.fit(X_train, y_train)

best_ada = grid_ada.best_estimator_
y_pred_ada = best_ada.predict(X_test)
acc_ada = accuracy_score(y_test, y_pred_ada)
print("AdaBoost Test set accuracy: {:.2f}".format(acc_ada))
print("Best Parameters for AdaBoost:", grid_ada.best_params_)

AdaBoost Test set accuracy: 1.00
Best Parameters for AdaBoost: {'base_estimator__max_depth': 1, 'learning_rate': 1.0, 'n_estimators': 50}


In [17]:
# Hyperparameter tuning for AdaBoost
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'base_estimator__max_depth': [1, 2, 3]  # This specifies the max depth of the base estimator (usually a Decision Tree)
}

# Create a base estimator (e.g., Decision Tree)
dt = DecisionTreeClassifier()

# Create the AdaBoost classifier
ada = AdaBoostClassifier(base_estimator=dt, random_state=1)

grid_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, scoring='accuracy', cv=5)
grid_ada.fit(X_train, y_train)

best_ada = grid_ada.best_estimator_
y_pred_ada = best_ada.predict(X_test)
acc_ada = accuracy_score(y_test, y_pred_ada)
print("AdaBoost Test set accuracy: {:.2f}".format(acc_ada))
print("Best Parameters for AdaBoost:", grid_ada.best_params_)

AdaBoost Test set accuracy: 1.00
Best Parameters for AdaBoost: {'base_estimator__max_depth': 1, 'learning_rate': 1.0, 'n_estimators': 50}


#### Tugas 3

Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma
1. Logistic Regression
2. SVM kernel polynomial
3. Decission Tree

Anda boleh melakukan eksplorasi dengan melakukan tunning hyperparameter

***Jawab***

##### Import Library

In [18]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

##### Persiapan Data

In [19]:
# Load Data

di = pd.read_csv('data/diabetes.csv')

di.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


##### Nama Kolom

In [21]:
# Cek nama kolom
di.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

##### Cek Kolom Null

In [22]:
# Cek kolom null
di.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [23]:
# Cek kolom neng nilai 0
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(di.loc[di[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [24]:
# Impute nilai 0 dengan mean
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

di[feature_columns] = fill_values.fit_transform(di[feature_columns])

In [25]:
di.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31.0,0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,4.494673,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


##### Split Data Training dan Data Testing

In [26]:
X = di[feature_columns]
y = di.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

##### Standarisasi Fitur

In [27]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

##### **Ensemble Voting**

In [37]:
# Definisikan algoritma yang akan digunakan untuk voting
clf1 = LogisticRegression(random_state=1)
clf2 = SVC(kernel='poly', degree=3, C=1, random_state=1)
clf3 = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=2, random_state=1)

# Model voting dengan hard voting (majority vote)
voting = VotingClassifier(estimators=[
    ('logistic_regression', clf1),
    ('svm_poly', clf2),
    ('decision_tree', clf3)
], voting='hard')

# Fit model ensemble voting
voting.fit(X_train_std, y_train)

# Prediksi
y_pred_voting = voting.predict(X_test_std)

# Evaluasi akurasi pada data pengujian
acc_voting = accuracy_score(y_test, y_pred_voting)

# Cetak hasil evaluasi
print("Ensemble Voting Test set accuracy: {:.2f}".format(acc_voting))

Ensemble Voting Test set accuracy: 0.75
