Mendiskritisasi (Diskritisasi)

Diskritisasi adalah proses mengubah nilai numerik menjadi kategori (misalnya dari angka ke “rendah”, “sedang”, “tinggi”).

Load dan Persiapkan Data

In [2]:
import pandas as pd
from sklearn.datasets import load_iris

# Long scikit-learn feature names -> short column names, listed in the
# order the columns should appear in the final table.
short_names = {
    'petal length (cm)': 'petal_length',
    'petal width (cm)': 'petal_width',
    'sepal length (cm)': 'sepal_length',
    'sepal width (cm)': 'sepal_width',
}

iris = load_iris()

# Build the table in one chain: rename the measurements, attach a 1-based
# row id and the species name, then put the columns in display order.
df = (
    pd.DataFrame(iris.data, columns=iris.feature_names)
    .rename(columns=short_names)
    .assign(
        id=range(1, len(iris.data) + 1),
        Class=list(iris.target_names[iris.target]),
    )
    [['id', 'Class', *short_names.values()]]
)

# Print every row as plain text, without the DataFrame index.
print(df.to_string(index=False))

 id      Class  petal_length  petal_width  sepal_length  sepal_width
  1     setosa           1.4          0.2           5.1          3.5
  2     setosa           1.4          0.2           4.9          3.0
  3     setosa           1.3          0.2           4.7          3.2
  4     setosa           1.5          0.2           4.6          3.1
  5     setosa           1.4          0.2           5.0          3.6
  6     setosa           1.7          0.4           5.4          3.9
  7     setosa           1.4          0.3           4.6          3.4
  8     setosa           1.5          0.2           5.0          3.4
  9     setosa           1.4          0.2           4.4          2.9
 10     setosa           1.5          0.1           4.9          3.1
 11     setosa           1.5          0.2           5.4          3.7
 12     setosa           1.6          0.2           4.8          3.4
 13     setosa           1.4          0.1           4.8          3.0
 14     setosa           1.1      

Diskritisasi Semua Fitur dengan KMeans

In [7]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans

# Load the Iris measurements into a DataFrame.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Add the species name and a 1-based row id.
df['Class'] = [iris.target_names[i] for i in iris.target]
df.insert(0, 'id', range(1, len(df) + 1))

# Reorder the columns and give them shorter names.
df = df[['id', 'Class', 'petal length (cm)', 'petal width (cm)', 'sepal length (cm)', 'sepal width (cm)']]
df.columns = ['id', 'Class', 'petal_length', 'petal_width', 'sepal_length', 'sepal_width']

# Discretize a copy so the numeric values in `df` stay intact.
df_discrete = df.copy()

# Rank of a cluster (0 = lowest mean value) -> category letter.
label_map = {0: 'A', 1: 'B', 2: 'C'}

# Discretize each numeric feature on its own with a one-dimensional KMeans.
for col in ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']:
    # n_init is pinned explicitly: scikit-learn changed its default from 10
    # to 'auto' in version 1.4 (and warns about it on 1.2-1.3), so leaving it
    # implicit makes the resulting bins depend on the installed version.
    # The number of clusters follows the number of available labels.
    kmeans = KMeans(n_clusters=len(label_map), n_init=10, random_state=42)
    clusters = kmeans.fit_predict(df[[col]])

    # KMeans cluster ids are arbitrary, so rank the clusters by the mean of
    # the original values they contain; the lowest mean becomes 'A'.
    centers = pd.DataFrame({'cluster': clusters, 'value': df[col]})
    ordered = centers.groupby('cluster').mean().sort_values('value').index
    cluster_to_label = {cluster: label_map[i] for i, cluster in enumerate(ordered)}

    df_discrete[col] = [cluster_to_label[c] for c in clusters]

# Show the full discretized table as plain text, without the index.
print(df_discrete.to_string(index=False))

 id      Class petal_length petal_width sepal_length sepal_width
  1     setosa            A           A            A           C
  2     setosa            A           A            A           B
  3     setosa            A           A            A           B
  4     setosa            A           A            A           B
  5     setosa            A           A            A           C
  6     setosa            A           A            B           C
  7     setosa            A           A            A           C
  8     setosa            A           A            A           C
  9     setosa            A           A            A           B
 10     setosa            A           A            A           B
 11     setosa            A           A            B           C
 12     setosa            A           A            A           C
 13     setosa            A           A            A           B
 14     setosa            A           A            A           B
 15     setosa           

Klasifikasi Sebelum Diskritisasi

In [8]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Long scikit-learn feature names -> short column names, in display order.
short_names = {
    'petal length (cm)': 'petal_length',
    'petal width (cm)': 'petal_width',
    'sepal length (cm)': 'sepal_length',
    'sepal width (cm)': 'sepal_width',
}

# Rebuild the Iris table: 1-based id, species name, then the measurements.
iris = load_iris()
df = (
    pd.DataFrame(iris.data, columns=iris.feature_names)
    .rename(columns=short_names)
    .assign(
        id=range(1, len(iris.data) + 1),
        Class=list(iris.target_names[iris.target]),
    )
    [['id', 'Class', *short_names.values()]]
)

# Classification BEFORE discretization: the features are the four raw
# numeric measurements, the target is the species name.
y = df['Class']
X = df[list(short_names.values())]

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

# Fit both classifiers on the training rows, then predict the held-out rows.
nb = GaussianNB().fit(X_train, y_train)
dt = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
pred_nb = nb.predict(X_test)
pred_dt = dt.predict(X_test)

# Report test-set accuracy for each model.
print("=== Sebelum Diskritisasi (data asli) ===")
print("Naive Bayes Accuracy     :", accuracy_score(y_test, pred_nb))
print("Decision Tree Accuracy   :", accuracy_score(y_test, pred_dt))


=== Sebelum Diskritisasi (data asli) ===
Naive Bayes Accuracy     : 0.9666666666666667
Decision Tree Accuracy   : 0.9666666666666667


Klasifikasi Setelah Diskritisasi

In [9]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Discretize a copy so the numeric values in `df` stay intact.
df_discrete = df.copy()

# Rank of a cluster (0 = lowest mean value) -> category letter.
label_map = {0: 'A', 1: 'B', 2: 'C'}
feature_cols = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']

# NOTE(review): the KMeans bins below are fitted on every row of `df`, i.e.
# before the train/test split further down, so the rows that end up in the
# test set also influence the bin boundaries. Consider fitting the bins on
# the training rows only if a strict hold-out evaluation is required.
for col in feature_cols:
    # n_init is pinned explicitly: scikit-learn changed its default from 10
    # to 'auto' in version 1.4 (and warns about it on 1.2-1.3), so leaving it
    # implicit makes the resulting bins depend on the installed version.
    kmeans = KMeans(n_clusters=len(label_map), n_init=10, random_state=42)
    clusters = kmeans.fit_predict(df[[col]])

    # KMeans cluster ids are arbitrary, so rank the clusters by the mean of
    # the original values: A = low, B = medium, C = high.
    centers = pd.DataFrame({'cluster': clusters, 'value': df[col]})
    ordered = centers.groupby('cluster').mean().sort_values('value').index
    cluster_to_label = {cluster: label_map[i] for i, cluster in enumerate(ordered)}

    df_discrete[col] = [cluster_to_label[c] for c in clusters]

# Turn the A/B/C categories into the numbers 0/1/2 with an explicit ordinal
# mapping derived from label_map. LabelEncoder is documented for target
# values rather than features, and it would only preserve the low-to-high
# order here because the letters happen to sort alphabetically.
label_to_code = {label: rank for rank, label in label_map.items()}
df_encoded = df_discrete.copy()
for col in feature_cols:
    df_encoded[col] = [label_to_code[label] for label in df_encoded[col]]

# Classification AFTER discretization: same target, encoded bins as features.
X_d = df_encoded.drop(columns=['id', 'Class'])
y_d = df_encoded['Class']

# Same split parameters as the baseline so the two runs are comparable.
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.2, random_state=1)

nb_d = GaussianNB()
nb_d.fit(X_train_d, y_train_d)
pred_nb_d = nb_d.predict(X_test_d)

dt_d = DecisionTreeClassifier(random_state=1)
dt_d.fit(X_train_d, y_train_d)
pred_dt_d = dt_d.predict(X_test_d)

# Report test-set accuracy for each model.
print("\n=== Setelah Diskritisasi (A/B/C) ===")
print("Naive Bayes Accuracy     :", accuracy_score(y_test_d, pred_nb_d))
print("Decision Tree Accuracy   :", accuracy_score(y_test_d, pred_dt_d))



=== Setelah Diskritisasi (A/B/C) ===
Naive Bayes Accuracy     : 0.9666666666666667
Decision Tree Accuracy   : 0.9666666666666667
