<a href="https://colab.research.google.com/github/yunisroni/ilmudataproject/blob/main/FS_DS3_CHI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [60]:
# Dataset https://archive.ics.uci.edu/dataset/78/page+blocks+classification

from ucimlrepo import fetch_ucirepo

# Download the UCI "Page Blocks Classification" dataset (repository id 78)
page_blocks_classification = fetch_ucirepo(id=78)

# Pull the feature table and the target table out of the fetched payload
payload = page_blocks_classification.data
X, y = payload.features, payload.targets

# Print the dataset-level metadata first, then the per-variable information
for section in (page_blocks_classification.metadata, page_blocks_classification.variables):
  print(section)


{'uci_id': 78, 'name': 'Page Blocks Classification', 'repository_url': 'https://archive.ics.uci.edu/dataset/78/page+blocks+classification', 'data_url': 'https://archive.ics.uci.edu/static/public/78/data.csv', 'abstract': 'The problem consists of classifying all the blocks of the page layout of a document that has been detected by a segmentation process.', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 5473, 'num_features': 10, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Fri Mar 15 2024', 'dataset_doi': '10.24432/C5J590', 'creators': ['Donato Malerba'], 'intro_paper': None, 'additional_info': {'summary': 'The 5473 examples comes from 54 distinct documents. Each observation concerns one block. All attributes are numeric. Data are in a format readable by C4.5.'

In [61]:
import pandas as pd

# Assemble a single DataFrame: the columns of X followed by the columns of y
df = pd.concat(objs=[X, y], axis="columns")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   height    5473 non-null   int64  
 1   length    5473 non-null   int64  
 2   area      5473 non-null   int64  
 3   eccen     5473 non-null   float64
 4   p_black   5473 non-null   float64
 5   p_and     5473 non-null   float64
 6   mean_tr   5473 non-null   float64
 7   blackpix  5473 non-null   int64  
 8   blackand  5473 non-null   int64  
 9   wb_trans  5473 non-null   int64  
 10  class     5473 non-null   int64  
dtypes: float64(4), int64(7)
memory usage: 470.5 KB


In [62]:
# Inspect which distinct labels occur in the 'class' column

class_labels = df['class'].unique()
print(class_labels)

[1 2 4 5 3]


In [63]:
# Hitung nilai shannon entropy dari masing-masing variabel

import numpy as np

def shannon_entropy(data):
  """Compute the Shannon entropy, in bits, of the empirical distribution of `data`.

  Every distinct value is treated as one category whose probability is its
  relative frequency in `data`.

  Args:
    data: Array-like of values (anything `np.unique` accepts).

  Returns:
    The entropy as a non-negative float. Empty input and input with a single
    distinct value both give 0.0 (never -0.0).
  """
  # Only the per-value counts matter; the distinct values themselves are not needed
  _, counts = np.unique(data, return_counts=True)
  probabilities = counts / counts.sum()
  entropy = -np.sum(probabilities * np.log2(probabilities))
  # Negating a zero sum yields -0.0; adding +0.0 normalises it to 0.0
  return entropy + 0.0

# Report the Shannon entropy of every column of df, one line per column
for column in df.columns:
  print(f"Shannon entropy untuk variabel '{column}': {shannon_entropy(df[column])}")

Shannon entropy untuk variabel 'height': 3.8086700476437265
Shannon entropy untuk variabel 'length': 7.567619278325599
Shannon entropy untuk variabel 'area': 9.430987836447464
Shannon entropy untuk variabel 'eccen': 9.50189834043902
Shannon entropy untuk variabel 'p_black': 8.819481331477856
Shannon entropy untuk variabel 'p_and': 8.481449746114148
Shannon entropy untuk variabel 'mean_tr': 8.50236241340674
Shannon entropy untuk variabel 'blackpix': 8.920098701308842
Shannon entropy untuk variabel 'blackand': 9.980625685906928
Shannon entropy untuk variabel 'wb_trans': 7.845827416365102
Shannon entropy untuk variabel 'class': 0.6354628258660037


In [64]:
# Normalise each column's Shannon entropy by log2(n) and average the results.
# Every column of df has the same length, so the normaliser is computed once
# instead of twice per column.
# NOTE(review): the loop covers all columns of df, including the target 'class' —
# confirm that the target is meant to be part of this average.
n = len(df)  # number of rows, identical for every column
# With n <= 1 the normaliser would be 0 (or log2(0)); fall back to 0 as before
max_entropy = np.log2(n) if n > 1 else 0.0

normalized_entropies = [
  shannon_entropy(df[column]) / max_entropy if max_entropy != 0 else 0
  for column in df.columns
]


average_normalized_entropy = np.mean(normalized_entropies)
print(f"\nRata-rata Normalized Shannon Entropy: {average_normalized_entropy}")


Rata-rata Normalized Shannon Entropy: 0.6112366413558321


In [67]:
# Seleksi fitur dalam df dengan chi-squared

from sklearn.feature_selection import chi2

# Feature/target split: every column except 'class' is a feature
y = df['class']
X = df.drop(columns='class')

# One chi-squared statistic and one p-value per feature column.
# NOTE(review): sklearn documents chi2 for non-negative, count-like features; these
# columns are raw measurements on very different scales, so the score magnitudes may
# largely reflect scale — confirm this is intended.
# NOTE(review): the scores are computed on all rows of df, including rows that a later
# cell holds out as the test set — confirm this is acceptable.
chi2_scores, p_values = chi2(X, y)

# Tabulate the results, highest Chi2 score first
results = (
  pd.DataFrame({'Feature': X.columns, 'Chi2 Score': chi2_scores, 'P-value': p_values})
  .sort_values('Chi2 Score', ascending=False)
)

# Show the ranking
print(results)

# Keep the features whose Chi2 score is strictly above the cut-off
threshold = 10  # cut-off value
above_threshold = results['Chi2 Score'] > threshold
selected_features_chi = results.loc[above_threshold, 'Feature'].tolist()

print(f"\nFitur yang terpilih menggunakan chi-squared: {selected_features_chi}")

    Feature    Chi2 Score       P-value
2      area  1.666104e+07  0.000000e+00
7  blackpix  7.249989e+06  0.000000e+00
8  blackand  6.460088e+06  0.000000e+00
9  wb_trans  1.293638e+05  0.000000e+00
3     eccen  1.110086e+05  0.000000e+00
6   mean_tr  1.022354e+05  0.000000e+00
0    height  4.546258e+04  0.000000e+00
1    length  1.906412e+04  0.000000e+00
4   p_black  1.582963e+02  3.390714e-33
5     p_and  1.985897e+01  5.324514e-04

Fitur yang terpilih menggunakan chi-squared: ['area', 'blackpix', 'blackand', 'wb_trans', 'eccen', 'mean_tr', 'height', 'length', 'p_black', 'p_and']


In [68]:
# Seleksi fitur dalam df dengan Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Split into the feature matrix (every column except 'class') and the target
X = df.drop('class', axis=1)
y = df['class']

# Random Forest whose feature importances drive the selection;
# the fixed random_state keeps the fitted forest reproducible
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the forest once, on all rows
rf_model.fit(X, y)

# Reuse the forest fitted above. Without prefit=True, SelectFromModel.fit would clone
# the estimator and train a second, identical forest, doubling the training cost.
# threshold='median' keeps the features whose importance is at least the median importance.
selector = SelectFromModel(estimator=rf_model, threshold='median', prefit=True)

# Boolean support mask -> names of the retained columns
selected_features_rf = X.columns[selector.get_support()]

print(f"Fitur yang terpilih menggunakan Random Forest: {selected_features_rf.tolist()}")

Fitur yang terpilih menggunakan Random Forest: ['height', 'eccen', 'p_black', 'mean_tr', 'wb_trans']


In [69]:
# Klasifikasi dengan model XGBoost untuk selected_features_chi

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Restrict the feature matrix to the columns chosen by the chi-squared step
selected_features = selected_features_chi
X_selected = X[selected_features]

# Binary target: class 1 -> 0, every other class -> 1.
# Derived from df['class'] instead of from y itself, so re-running this cell cannot
# flip the labels (np.where(y == 1, 0, 1) on an already binarised y swaps 0 and 1).
y = np.where(df['class'] == 1, 0, 1)

# Hold out 20% of the rows as the test set; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# XGBoost classifier with library-default hyperparameters
model = XGBClassifier()

# Fit on the training rows
model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model
# NOTE(review): the heading mentions a confusion matrix, but what is printed below is
# the classification report (precision / recall / f1 per class).
print("Hasil Cofusion Matrix Chi-Squared dan XGB:")
print(classification_report(y_test, y_pred))

Hasil Cofusion Matrix Chi-Squared dan XGB:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       979
           1       0.84      0.89      0.87       116

    accuracy                           0.97      1095
   macro avg       0.92      0.93      0.92      1095
weighted avg       0.97      0.97      0.97      1095



In [70]:
# Klasifikasi dengan model Naive Bayes untuk selected_features_chi

from sklearn.naive_bayes import GaussianNB

# Restrict the feature matrix to the columns chosen by the chi-squared step
selected_features = selected_features_chi
X_selected = X[selected_features]

# Binary target: class 1 -> 0, every other class -> 1.
# Built here from df['class'] so this cell does not depend on y having been
# binarised (exactly once) by an earlier cell.
y = np.where(df['class'] == 1, 0, 1)

# Hold out 20% of the rows as the test set; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Gaussian Naive Bayes classifier with default settings
model = GaussianNB()

# Fit on the training rows
model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model
# NOTE(review): the heading mentions a confusion matrix, but what is printed below is
# the classification report (precision / recall / f1 per class).
print("Hasil Cofusion Matrix Chi-Squared dan NB:")
print(classification_report(y_test, y_pred))

Hasil Cofusion Matrix Chi-Squared dan NB:
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       979
           1       0.52      0.49      0.50       116

    accuracy                           0.90      1095
   macro avg       0.73      0.72      0.72      1095
weighted avg       0.90      0.90      0.90      1095

