# Tugas Praktikum Week-06

## Tugas 1
Terdapat dataset <b>mushroom</b>. Berdasarkan dataset tersebut, bandingkan performa antara algoritma Decision Tree dan Random Forest. Gunakan tuning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

Import Library

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Load Data

In [8]:
# Load the mushroom dataset from a CSV path relative to the working directory.
df = pd.read_csv('assets/dataset/mushrooms.csv')

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


Preprocessing Data

In [9]:
# Count the missing values in each column.
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [10]:
# Separate the label from the features. Columns are selected by name rather
# than by position so the result does not depend on the column order of the CSV.
y = df['class']
X = df.drop(columns='class')

In [11]:
from sklearn.preprocessing import LabelEncoder

def feature_encode(df):
    """Return a copy of ``df`` in which every column is label-encoded.

    Each column is passed through ``LabelEncoder.fit_transform`` on its own,
    so the resulting codes are only meaningful within a single column.
    The frame passed in is not modified; the encoding is applied to a copy.
    """
    result = df.copy()  # work on a copy so the caller's frame stays untouched
    encoder = LabelEncoder()

    for column_name in result.columns:
        # fit_transform fits the encoder again on this column's values
        column_codes = encoder.fit_transform(result[column_name])
        result[column_name] = column_codes

    return result

# Rebind X to its encoded version and show the result.
X = feature_encode(X)
display(X)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,3,2,4,0,5,0,0,0,11,0,...,2,5,5,0,1,1,4,0,1,2
8120,5,2,4,0,5,0,0,0,11,0,...,2,5,5,0,0,1,4,0,4,2
8121,2,2,4,0,5,0,0,0,5,0,...,2,5,5,0,1,1,4,0,1,2
8122,3,3,4,0,8,1,0,1,0,1,...,1,7,7,0,2,1,0,7,4,2


Split Data

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

Training Data

In [13]:
# Baseline models with default hyperparameters. Both estimators are
# randomized, so a fixed seed keeps the fitted models identical across re-runs.
dt_classifier = DecisionTreeClassifier(random_state=1)
dt_classifier.fit(X_train, y_train)

rf_classifier = RandomForestClassifier(random_state=1)
rf_classifier.fit(X_train, y_train)

Menggunakan Tuning Hyperparameter

In [14]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for the Decision Tree: exhaustive grid search with
# 5-fold cross-validation on the training split. The estimator is seeded so
# the selected model is the same on every re-run.
dt_param_grid = {'max_depth': [None, 10, 20, 30],
                 'min_samples_split': [2, 5, 10]}
dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=1), dt_param_grid, cv=5)
dt_grid_search.fit(X_train, y_train)
best_dt_model = dt_grid_search.best_estimator_
# Report the winning configuration and its mean cross-validated score.
print("Best Decision Tree params:", dt_grid_search.best_params_)
print(f"Best Decision Tree CV score: {dt_grid_search.best_score_:.4f}")

# Hyperparameter tuning for the Random Forest, using the same procedure.
rf_param_grid = {'n_estimators': [10, 50, 100],
                 'max_depth': [None, 10, 20, 30],
                 'min_samples_split': [2, 5, 10]}
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=1), rf_param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)
best_rf_model = rf_grid_search.best_estimator_
print("Best Random Forest params:", rf_grid_search.best_params_)
print(f"Best Random Forest CV score: {rf_grid_search.best_score_:.4f}")


Hasil Training

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the held-out test split with both tuned models.
dt_predictions = best_dt_model.predict(X_test)
rf_predictions = best_rf_model.predict(X_test)

# Decision Tree: per-class metrics, then the confusion matrix.
dt_report = classification_report(y_test, dt_predictions)
print("Clasification Report Decision Tree:\n", dt_report)
conf_matrix = confusion_matrix(y_test, dt_predictions)
print("Confusion Matrix:\n", conf_matrix)

# Random Forest: per-class metrics, then the confusion matrix.
rf_report = classification_report(y_test, rf_predictions)
print("\nClasification Report Random Forest:\n", rf_report)
conf_matrix = confusion_matrix(y_test, rf_predictions)
print("Confusion Matrix:\n", conf_matrix)


Clasification Report Decision Tree:
               precision    recall  f1-score   support

           e       1.00      1.00      1.00       820
           p       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
 [[820   0]
 [  0 805]]

Clasification Report Random Forest:
               precision    recall  f1-score   support

           e       1.00      1.00      1.00       820
           p       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
 [[820   0]
 [  0 805]]


## Tugas 2

Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan performa antara algoritma Decision Tree dan AdaBoost. Gunakan tuning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [16]:
from sklearn.ensemble import AdaBoostClassifier
# Baseline AdaBoost with default hyperparameters; seeded so re-runs give the same model.
ada_classifier = AdaBoostClassifier(random_state=1)
ada_classifier.fit(X_train, y_train)

In [17]:
# Hyperparameter tuning for AdaBoost: exhaustive grid search with 5-fold
# cross-validation on the training split. The estimator is seeded so the
# selected model is the same on every re-run.
ada_param_grid = {'n_estimators': [50, 100, 200],
                  'learning_rate': [0.01, 0.1, 1.0]}
ada_grid_search = GridSearchCV(AdaBoostClassifier(random_state=1), ada_param_grid, cv=5)
ada_grid_search.fit(X_train, y_train)
best_ada_model = ada_grid_search.best_estimator_
# Report the winning configuration and its mean cross-validated score.
print("Best AdaBoost params:", ada_grid_search.best_params_)
print(f"Best AdaBoost CV score: {ada_grid_search.best_score_:.4f}")

In [18]:
# Evaluate the tuned AdaBoost model on the held-out test split.
# The predictions must come from best_ada_model (not best_rf_model),
# otherwise this report would describe the Random Forest instead of AdaBoost.
ada_predictions = best_ada_model.predict(X_test)
print("\nClasification Report Ada Boost:\n", classification_report(y_test, ada_predictions))

conf_matrix = confusion_matrix(y_test, ada_predictions)
print("Confusion Matrix:\n", conf_matrix)


Clasification Report Ada Boost:
               precision    recall  f1-score   support

           e       1.00      1.00      1.00       820
           p       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
 [[820   0]
 [  0 805]]


## Tugas 3

Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma
1. Logistic Regression
2. SVM kernel polynomial
3. Decision Tree

Anda boleh melakukan eksplorasi dengan melakukan tuning hyperparameter.

Import Library

In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

Load Data

In [20]:
# Load data
df = pd.read_csv('assets/dataset/diabetes.csv')

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Preprocessing Data

In [21]:
# Count the missing values in each column.
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Memisahkan Label dan Fitur

In [22]:
# The 'Outcome' column is the target; every remaining column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

Standarisasi Fitur

In [23]:
# Standardize the features with StandardScaler and rebind X to the scaled output.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split that follows, so test rows contribute to the scaling statistics.
# Consider fitting on the training split only — confirm whether this matters here.
scaler = StandardScaler()
X = scaler.fit_transform(X)

Split Data

In [24]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Inisialisasi Model

In [25]:
# Base learners for the voting ensemble; each one is given a fixed seed.
logistic_regression = LogisticRegression(
    C=0.01,
    random_state=42,
)
svm_polynomial = SVC(
    kernel='poly',
    degree=3,
    C=1,
    random_state=42,
)
decision_tree = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=4,
    random_state=42,
)

Menggunakan Ensemble Voting

In [26]:
# Named base estimators, combined with hard voting.
voting_members = [
    ('lr', logistic_regression),
    ('svm', svm_polynomial),
    ('dt', decision_tree),
]
ensemble_classifier = VotingClassifier(estimators=voting_members, voting='hard')

Training Ensemble Classifier

In [27]:
# Fit the voting ensemble on the training split.
ensemble_classifier.fit(X_train, y_train)

Hasil Training

In [29]:
# Accuracy of the ensemble on the held-out test split.
y_pred = ensemble_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Akurasi Ensemble Classifier: {accuracy:.2f}')

Akurasi Ensemble Classifier: 0.79
