# Overview Materi

Source: https://www.youtube.com/watch?v=LDRbO9a6XPU

Jelaskan secara singkat apa itu decision tree menurut pemahamanmu!

# Import Data & Libraries

In [None]:
from __future__ import print_function

# label kolom
header = ["color", "diameter", "label"]

# data training
training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],]

# data testing
testing_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 4, 'Apple'],
    ['Red', 2, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],]

# Fungsi Dasar

In [None]:
# fungsi mencari apa saja unique value dari suatu kolom
def unique_vals(rows, col):
  return set([row[col]for row in rows])
# contoh penggunaan
print(unique_vals(training_data, 0))
print(unique_vals(training_data, 1))
print(unique_vals(training_data, 2))

{'Green', 'Red', 'Yellow'}
{1, 3}
{'Grape', 'Lemon', 'Apple'}


In [None]:
# fungsi Menghitung jumlah unique value dari suatu kolom
def class_counts(rows):

    counts = {}
    for row in rows:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts
# contoh penggunaan
print(class_counts(training_data))

{'Apple': 2, 'Grape': 2, 'Lemon': 1}


In [None]:
# fungsi pengecekan suatu value numerik atau bukan
def is_numeric(value):
     return isinstance(value, int) or isinstance(value, float)
#

# contoh penggunaan
print(is_numeric(7))
print(is_numeric("Red"))

True
False


In [None]:
# kelas untuk merepresentasikan pertanyaan pada decision tree
class Question:

    # inisialisasi kolom dan nilai pertanyaan
    def __init__(self, column, value):
        self.column = column
        self.value = value

    # mengecek apakah nilai pada baris sesuai dengan pertanyaan


    # mengecek apakah contoh data sesuai dengan pertanyaan
    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    # menampilkan pertanyaan dalam format string yang mudah dibaca
    def __repr__(self):
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))
#

# contoh penggunaan 1
Question(1, 3)

# contoh penggunaan 2
q = Question(0, 'Green')
q

Is color == Green?

In [None]:
# membagi dataset menjadi dua berdasarkan pertanyaan
def partition(rows, question):
    true_rows, false_rows = [], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows

# contoh penggunaan
true_rows, false_rows = partition(training_data, Question(0, 'Red'))
print(true_rows)
print(false_rows)

[['Red', 1, 'Grape'], ['Red', 1, 'Grape']]
[['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Yellow', 3, 'Lemon']]


**apa itu gini impurity?**
<br> gini impurity berfungsi mengukur tingkat ketidakmurnian atau ketidakteraturan pada sebuah simpul (node) dalam pohon

In [None]:
# menghitung nilai Gini Impurity untuk sebuah dataset
def gini(rows):
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl**2
    return impurity



# contoh penggunaan
print(gini(training_data))



0.6399999999999999


**apa itu information gain?**
<br> information gain berfungsi mengukur seberapa efektif sebuah fitur dalam memisahkan data berdasarkan kelas-kelasnya

In [None]:
# menghitung nilai Information Gain dari pemisahan dataset
def info_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

# contoh penggunaan
current_uncertainty = gini(training_data)
true_rows, false_rows = partition(training_data, Question(0, 'Green'))
info_gain(true_rows, false_rows, current_uncertainty)

0.1399999999999999

In [30]:

def find_best_split(rows):
    best_gain = 0
    best_question = None
    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1

    # splitting the dataset
    def partition(rows, question):
        true_rows, false_rows = [], []
        for row in rows:
            if question.match(row):
                true_rows.append(row)
            else:
                false_rows.append(row)
        return true_rows, false_rows

    for col in range(n_features):
        values = set([row[col] for row in rows])

        for val in values:
            question = Question(col, val)

            # divide the dataset
            true_rows, false_rows = partition(rows, question)

            # Skip this split if it doesn't divide the dataset
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # calculate the information gain from this split
            gain = info_gain(true_rows, false_rows, current_uncertainty)

            if gain >= best_gain:
                best_gain, best_question = gain, question

                # return the best information gain and the corresponding question
    return best_gain, best_question

# contoh penggunaan
best_gain, best_question = find_best_split(training_data)
print(best_gain)
print(best_question)






NameError: name 'training_data' is not defined

# Fungsi Decision Tree

In [5]:
# merepresentasikan node daun (leaf) pada decision tree yang berisi hasil prediksi
class Leaf:

    # inisialisasi leaf dengan menghitung jumlah kemunculan tiap kelas
    def __init__(self, rows):
      self.predictions = class_counts(rows)



In [11]:
# merepresentasikan node keputusan (decision node) yang berisi pertanyaan dan cabang
class Decision_Node:

    # inisialisasi node dengan pertanyaan, cabang benar, dan cabang salah
    def __init__(self,
                 question,
                 true_branch,
                 false_branch):
        ...


In [6]:
# membangun decision tree secara rekursif
def build_tree(rows):
    ...
    return Decision_Node(question, true_branch, false_branch)

In [29]:
# mencetak struktur decision tree secara rekursif dalam format teks
def print_tree(node, spacing=""):

    # base case: jika sudah mencapai leaf
    if isinstance(node, Leaf):
        print(spacing + "Prediksi:", node.predictions)
        return

    # mencetak pertanyaan pada node saat ini
    print(spacing + str(node.question))

    # mencetak cabang true secara rekursif
    print(spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    # mencetak cabang false secara rekursif
    print(spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ") "

# contoh penggunaan
my_tree = build_tree(training_data)
print_tree(my_tree)

SyntaxError: unterminated string literal (detected at line 18) (ipython-input-282842461.py, line 18)

In [31]:
# mengklasifikasikan satu baris data menggunakan decision tree
def classify(row, node):

    # base case: jika sudah mencapai leaf
    if isinstance(node, Leaf):
        return node.predictions

    # menentukan apakah mengikuti cabang true atau cabang false
    # dengan membandingkan nilai fitur pada baris dengan pertanyaan di node
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)


# menghitung jumlah label dalam dataset
def class_counts(rows):
    counts = {}
    for row in rows:
        label = row[-1]  # label ada di kolom terakhir
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts


In [12]:
# menampilkan prediksi pada leaf dalam format persentase
def print_leaf(counts):
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
    return probs

# contoh penggunaan
testing_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 4, 'Apple'],
    ['Red', 2, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]

# Predict Using Decision Tree

In [23]:


# ==============================

# Leaf node (simpul akhir)
class Leaf:
    def __init__(self, predictions):
        self.predictions = predictions  # dict berisi jumlah label

# Decision node (simpul pertanyaan)
class Decision_Node:
    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

# Pertanyaan (kondisi untuk split)
class Question:
    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, row):
        val = row[self.column]
        return val == self.value  # untuk contoh sederhana pakai ==

    def __repr__(self):
        return f"Apakah kolom[{self.column}] == {self.value}?"


def print_tree(node, spacing=""):
    if isinstance(node, Leaf):
        print(spacing + "Prediksi:", node.predictions)
        return

    print(spacing + str(node.question))

    print(spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    print(spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")


# Klasifikasi satu baris data
def classify(row, node):
    if isinstance(node, Leaf):
        return node.predictions

    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)


# ==============================
# 3. Contoh tree manual
# ==============================

# Leaf nodes
leaf1 = Leaf({"Apple": 3})   # kalau kondisi True
leaf2 = Leaf({"Orange": 2})  # kalau kondisi False

# Root node dengan pertanyaan
root = Decision_Node(
    Question(0, "Green"),  # cek apakah kolom[0] == "Green"
    leaf1,
    leaf2
)

# ==============================
# 4. Cetak struktur tree
# ==============================
print("Struktur Decision Tree:")
print_tree(root)

# ==============================
# 5. Uji dengan data baru
# ==============================
testing_data = [
    ["Green", 3, "Apple"],
    ["Yellow", 3, "Orange"],
    ["Green", 2, "Apple"],
]

print("\nPrediksi Data Uji:")
for row in testing_data:
    prediction = classify(row, root)
    print("Data:", row, "=> Prediksi:", prediction)


Struktur Decision Tree:
Apakah kolom[0] == Green?
--> True:
  Prediksi: {'Apple': 3}
--> False:
  Prediksi: {'Orange': 2}

Prediksi Data Uji:
Data: ['Green', 3, 'Apple'] => Prediksi: {'Apple': 3}
Data: ['Yellow', 3, 'Orange'] => Prediksi: {'Orange': 2}
Data: ['Green', 2, 'Apple'] => Prediksi: {'Apple': 3}
