# Import Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Tugas 1
Terdapat dataset mushroom. Berdasarkan dataset yang tersebut, bandingkan peforma antara algoritma Decision Tree dan RandomForest. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [3]:
#persiapan data
df = pd.read_csv('mushrooms.csv')

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Cek kolom null
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [5]:
label_encoder = LabelEncoder()

# Loop melalui semua kolom dalam DataFrame
for column in df.columns:
    # Periksa apakah tipe data kolom adalah objek (data kategorikal)
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column])
        
#cek data yang sudah diencode
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [6]:
# Seleksi fitur
X = df.iloc[:,2:]
y = df['class']

# Cek jumlah fitur dan instance
X.shape

(8124, 21)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [8]:
from sklearn.model_selection import GridSearchCV

dt = DecisionTreeClassifier()

params = {'max_depth': [None, 10, 20, 30],
          'min_samples_split': [2, 5, 10],
          'min_samples_leaf': [1, 2, 4]}

GS = GridSearchCV(estimator=dt,param_grid=params,cv=5,n_jobs=-1, verbose=True, scoring='accuracy')

# training/fit
GS.fit(X_train, y_train)

# mengambil model Decision Tree terbaik yang telah dihasilkan oleh proses Grid Search.
best_dtree = GS.best_estimator_

# Memprediksi label set test
y_pred_dt = best_dtree.predict(X_test)

#  menghitung set accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy         : {} %".format(round((acc_dt*100),2)))
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Accuracy         : 100.0 %
Test set accuracy: 1.00
Test set accuracy: 1.0


Training RandomForest

In [9]:
rf = RandomForestClassifier(n_estimators=10, random_state=1, n_jobs=-1)

# Sesuaikan dt ke set training
rf.fit(X_train, y_train)

# Memprediksi label set test
y_pred_rf = rf.predict(X_test)

#  menghitung set accuracy
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy         : {} %".format(round((acc_rf*100),2)))
print("Test set accuracy: {:.2f}".format(acc_rf))
print(f"Test set accuracy: {acc_rf}")

Accuracy         : 100.0 %
Test set accuracy: 1.00
Test set accuracy: 1.0


# Tugas 2
Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan peforma antara algoritma Decision Tree dan AdaBoost. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [10]:
#import AdaBoost
from sklearn.ensemble import AdaBoostClassifier

In [12]:
df2 = pd.read_csv('mushrooms.csv')

df2.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [13]:
# Cek kolom null
df2.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [14]:
label_encoder = LabelEncoder()

# Loop melalui semua kolom dalam DataFrame
for column in df.columns:
    # Periksa apakah tipe data kolom adalah objek (data kategorikal)
    if df2[column].dtype == 'object':
        df2[column] = label_encoder.fit_transform(df2[column])

In [15]:
# cek data setelah diencode
df2.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [16]:
# seleksi fitur
X = df.iloc[:,2:]
y = df['class']

# Cek jumlah fitur dan instance
X.shape

(8124, 21)

Split data testing dan training

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

Trainig AdaBoost

In [18]:
ada = AdaBoostClassifier(n_estimators=2)

# Sesuaikan dt ke set training
ada.fit(X_train, y_train)

# Memprediksi label set test
y_pred_ada = ada.predict(X_test)

#  menghitung set accuracy
acc_ada = accuracy_score(y_test, y_pred_ada)
print("Test set accuracy: {:.2f}".format(acc_ada))
print(f"Test set accuracy: {acc_ada}")

Test set accuracy: 0.84
Test set accuracy: 0.8449230769230769


# Tugas 3
Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma

- Logistic Regression
- SVM kernel polynomial
- Decission Tree

In [19]:
from sklearn.svm import SVC # import SVM classifier
from sklearn.ensemble import VotingClassifier # import model Voting
from sklearn.linear_model import LogisticRegression

In [21]:
df3 = pd.read_csv('diabetes.csv')

df3.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [22]:
# Cek kolom null
df3.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [23]:
from sklearn.impute import SimpleImputer
# melakukakan imputasi kolom yang memiliki data sama dengan 0

feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

df3[feature_columns] = fill_values.fit_transform(df3[feature_columns])

Split data training dan testing

In [24]:
X = df3[feature_columns]
y = df3.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Training dengan Logistic Regression

Standarisasi fitur

In [25]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [26]:
# buat objek Logistic Regression
lr = LogisticRegression(solver='liblinear')

# hyperparameter
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 500, 1000]
}

# penyetelan hyperparameter menggunakan grid search
grid_search_lr = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train_std, y_train)

# evaluasi LR dengan hyperparameter terbaik
best_lr = grid_search_lr.best_estimator_

# predeksi data testing
y_pred_lr = best_lr.predict(X_test_std)

# Evaluasi akurasi data testing
acc_lr = accuracy_score(y_test, y_pred_lr)

# Print hasil evaluasi
print("Accuracy         : {} %".format(round((acc_lr*100),2)))
print("Test set accuracy: {:.2f}".format(acc_lr))
print(f"Test set accuracy: {acc_lr}")

Accuracy         : 73.59 %
Test set accuracy: 0.74
Test set accuracy: 0.7359307359307359


Traininig dengan SVM Polynomial

In [27]:
# model svm polynomial
svm_poly = SVC(kernel='poly')

# hyperparameter
param_grid = {
    'C': [0.1, 1, 10],
    'degree': [2, 3, 4],
    'coef0': [0.0, 1.0, 2.0],
    'gamma': ['scale', 'auto']
}

# penyetelan hyperparameter menggunakan grid search
grid_search_svm_poly = GridSearchCV(estimator=svm_poly, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search_svm_poly.fit(X_train_std, y_train)

# evaluasi svm polynomial dengan hyperparameter terbaik
best_svm_poly = grid_search_svm_poly.best_estimator_

# predeksi data testing
y_pred_svm_poly = best_svm_poly.predict(X_test_std)

# Evaluasi akurasi data testing
acc_svm_poly = accuracy_score(y_test, y_pred_svm_poly)

# Print hasil evaluasi
print("Accuracy         : {} %".format(round((acc_svm_poly*100),2)))
print("Test set accuracy: {:.2f}".format(acc_svm_poly))
print(f"Test set accuracy: {acc_svm_poly}")

Accuracy         : 74.46 %
Test set accuracy: 0.74
Test set accuracy: 0.7445887445887446


Training dengan Decission Tree

In [28]:
dt = DecisionTreeClassifier()

params = {'max_depth': [None, 10, 20, 30],
          'min_samples_split': [2, 5, 10],
          'min_samples_leaf': [1, 2, 4]}

GS = GridSearchCV(estimator=dt,param_grid=params,cv=5,n_jobs=-1, verbose=True, scoring='accuracy')

# training/fit
GS.fit(X_train_std, y_train)

# mengambil model Decision Tree terbaik yang telah dihasilkan oleh proses Grid Search.
best_dtree = GS.best_estimator_

# Memprediksi label set test
y_pred_dt = best_dtree.predict(X_test_std)

#  menghitung set accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy         : {} %".format(round((acc_dt*100),2)))
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Accuracy         : 75.76 %
Test set accuracy: 0.76
Test set accuracy: 0.7575757575757576


Training dengan Voting

In [29]:
# Definisikan algoritma yang akan digunakan untuk voting
clf1 = best_lr
clf2 = best_svm_poly
clf3 = best_dtree

# model hard voting
voting = VotingClassifier(estimators=[('logistic_reg', clf1), ('SVM-POLY', clf2), ('decission_tree', clf3)], voting='hard')

# Fit model
voting.fit(X_train_std, y_train)

# Prediksi
y_pred_vt1 = voting.predict(X_test_std)

# Evaluasi akurasi testing data
acc_vt1 = accuracy_score(y_test, y_pred_vt1)

# Print hasil evaluasi
print('Voting Hard')
print("Test set accuracy: {:.2f}".format(acc_vt1))
print(f"Test set accuracy: {acc_vt1}")

Voting Hard
Test set accuracy: 0.75
Test set accuracy: 0.7489177489177489
