<a href="https://colab.research.google.com/github/yunisroni/ilmudataproject/blob/main/FS_DS2_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
# Dataset https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic

from ucimlrepo import fetch_ucirepo

# fetch dataset
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# data (as pandas dataframes)
X = breast_cancer_wisconsin_diagnostic.data.features
y = breast_cancer_wisconsin_diagnostic.data.targets

# metadata
print(breast_cancer_wisconsin_diagnostic.metadata)

# variable information
print(breast_cancer_wisconsin_diagnostic.variables)


{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'ID': 230, 'type': 'NATIVE', 'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'venue': 'Electronic imaging', 'year': 1993, 'journal': None, 'DOI': '1

In [3]:
import pandas as pd

# Create data.frame
df = pd.concat([X, y], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   radius1             569 non-null    float64
 1   texture1            569 non-null    float64
 2   perimeter1          569 non-null    float64
 3   area1               569 non-null    float64
 4   smoothness1         569 non-null    float64
 5   compactness1        569 non-null    float64
 6   concavity1          569 non-null    float64
 7   concave_points1     569 non-null    float64
 8   symmetry1           569 non-null    float64
 9   fractal_dimension1  569 non-null    float64
 10  radius2             569 non-null    float64
 11  texture2            569 non-null    float64
 12  perimeter2          569 non-null    float64
 13  area2               569 non-null    float64
 14  smoothness2         569 non-null    float64
 15  compactness2        569 non-null    float64
 16  concavit

In [4]:
# Periksa isi kolom Diagnosis

print(df['Diagnosis'].unique())

['M' 'B']


In [5]:
# Konversi nilai Diagnosis menjadi numerik
diagnosis_mapping = {'M': 1, 'B': 0}
df['Diagnosis'] = df['Diagnosis'].map(diagnosis_mapping)

# Periksa kembali isi kolom Diagnosis setelah konversi
print(df['Diagnosis'].unique())

[1 0]


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   radius1             569 non-null    float64
 1   texture1            569 non-null    float64
 2   perimeter1          569 non-null    float64
 3   area1               569 non-null    float64
 4   smoothness1         569 non-null    float64
 5   compactness1        569 non-null    float64
 6   concavity1          569 non-null    float64
 7   concave_points1     569 non-null    float64
 8   symmetry1           569 non-null    float64
 9   fractal_dimension1  569 non-null    float64
 10  radius2             569 non-null    float64
 11  texture2            569 non-null    float64
 12  perimeter2          569 non-null    float64
 13  area2               569 non-null    float64
 14  smoothness2         569 non-null    float64
 15  compactness2        569 non-null    float64
 16  concavit

In [7]:
# Hitung nilai shannon entropy dari masing-masing variabel

import numpy as np

def shannon_entropy(data):
  """Hitung nilai Shannon entropy dari data.

  Args:
    data: Data yang akan dihitung entropy-nya.

  Returns:
    Nilai Shannon entropy.
  """
  value, counts = np.unique(data, return_counts=True)
  probabilities = counts / counts.sum()
  entropy = -np.sum(probabilities * np.log2(probabilities))
  return entropy

# Hitung entropy untuk setiap kolom
for column in df.columns:
  entropy = shannon_entropy(df[column])
  print(f"Shannon entropy untuk variabel '{column}': {entropy}")

Shannon entropy untuk variabel 'radius1': 8.733008172657566
Shannon entropy untuk variabel 'texture1': 8.824000154223153
Shannon entropy untuk variabel 'perimeter1': 8.983102658639638
Shannon entropy untuk variabel 'area1': 9.045509996081337
Shannon entropy untuk variabel 'smoothness1': 8.78024997322213
Shannon entropy untuk variabel 'compactness1': 9.037153427536234
Shannon entropy untuk variabel 'concavity1': 8.99611506403242
Shannon entropy untuk variabel 'concave_points1': 9.013689756475301
Shannon entropy untuk variabel 'symmetry1': 8.62397636243383
Shannon entropy untuk variabel 'fractal_dimension1': 8.89960569026648
Shannon entropy untuk variabel 'radius2': 9.047698243001964
Shannon entropy untuk variabel 'texture2': 8.97123115160596
Shannon entropy untuk variabel 'perimeter2': 9.022232118229253
Shannon entropy untuk variabel 'area2': 9.002865598003147
Shannon entropy untuk variabel 'smoothness2': 9.074956195557899
Shannon entropy untuk variabel 'compactness2': 9.049886489922592

In [8]:
# Normalisasi nilai Shannon entropy
normalized_entropies = []
for column in df.columns:
  entropy = shannon_entropy(df[column])
  n = len(df[column])  # Panjang data dalam kolom
  normalized_entropy = entropy / np.log2(n) if np.log2(n) != 0 else 0  # Normalisasi dengan log2(n)

  normalized_entropies.append(normalized_entropy)


average_normalized_entropy = np.mean(normalized_entropies)
print(f"\nRata-rata Normalized Shannon Entropy: {average_normalized_entropy}")


Rata-rata Normalized Shannon Entropy: 0.947701064870129


In [9]:
# Seleksi fitur dalam df dengan chi-squared

from sklearn.feature_selection import chi2

# Pisahkan fitur (X) dan target (y)
X = df.drop('Diagnosis', axis=1)  # mengambil semua kolom kecuali variabel Diagnosis
y = df['Diagnosis']

# Hitung statistik chi-squared dan p-value
chi2_scores, p_values = chi2(X, y)

# Buat DataFrame untuk menyimpan hasil
results = pd.DataFrame({'Feature': X.columns, 'Chi2 Score': chi2_scores, 'P-value': p_values})

# Urutkan berdasarkan Chi2 Score (dari yang terbesar)
results = results.sort_values('Chi2 Score', ascending=False)

# Tampilkan hasil
print(results)

# Simpan fitur dengan skor Chi2 di atas threshold
threshold = 10  # nilai ambang batas
selected_features_chi = results[results['Chi2 Score'] > threshold]['Feature'].tolist()

print(f"\nFitur yang terpilih menggunakan chi-squared: {selected_features_chi}")

               Feature     Chi2 Score        P-value
23               area3  112598.431564   0.000000e+00
3                area1   53991.655924   0.000000e+00
13               area2    8758.504705   0.000000e+00
22          perimeter3    3665.035416   0.000000e+00
2           perimeter1    2011.102864   0.000000e+00
20             radius3     491.689157  6.113248e-109
0              radius1     266.104917   8.013976e-60
12          perimeter2     250.571896   1.948775e-56
21            texture3     174.449400   7.896683e-40
1             texture1      93.897508   3.322922e-22
26          concavity3      39.516915   3.252301e-10
10             radius2      34.675247   3.895534e-09
6           concavity1      19.712354   9.001757e-06
25        compactness3      19.314922   1.108368e-05
27     concave_points3      13.485419   2.404244e-04
7      concave_points1      10.544035   1.165636e-03
5         compactness1       5.403075   2.010130e-02
28           symmetry3       1.298861   2.5442

In [10]:
# Seleksi fitur dalam df dengan Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Pisahkan fitur (X) dan target (y)
X = df.drop('Diagnosis', axis=1)  # mengambil semua kolom kecuali variabel Diagnosis
y = df['Diagnosis']

# Inisialisasi model Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Latih model Random Forest
rf_model.fit(X, y)

# Seleksi fitur menggunakan SelectFromModel
selector = SelectFromModel(estimator=rf_model, threshold='median') # threshold dengan median
selector.fit(X, y)

# Simpan fitur-fitur yang terpilih
selected_features_rf = X.columns[selector.get_support()]

print(f"Fitur yang terpilih menggunakan Random Forest: {selected_features_rf.tolist()}")

Fitur yang terpilih menggunakan Random Forest: ['radius1', 'texture1', 'perimeter1', 'area1', 'concavity1', 'concave_points1', 'radius2', 'area2', 'radius3', 'texture3', 'perimeter3', 'area3', 'compactness3', 'concavity3', 'concave_points3']


In [11]:
# Klasifikasi dengan model XGBoost untuk selected_features_rf

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Mengambil Fitur dari RF
selected_features = selected_features_rf
X_selected = X[selected_features]

# Ubah nilai y menjadi 0 dan 1
y = np.where(y == 1, 0, 1)

# Bagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Inisialisasi model XGBoost
model = XGBClassifier()

# Latih model dengan training data
model.fit(X_train, y_train)

# Lakukan prediksi dengan test data
y_pred = model.predict(X_test)

# Evaluasi performa model
# accuracy = accuracy_score(y_test, y_pred)
print("Hasil Cofusion Matrix RF dan XGB:")
print(classification_report(y_test, y_pred))

Hasil Cofusion Matrix RF dan XGB:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [12]:
# Klasifikasi dengan model Naive Bayes untuk selected_features_rf

from sklearn.naive_bayes import GaussianNB

# Mengambil Fitur dari RF
selected_features = selected_features_rf
X_selected = X[selected_features]

# Bagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Inisialisasi model Naive Bayes
model = GaussianNB()

# Latih model dengan training data
model.fit(X_train, y_train)

# Lakukan prediksi dengan test data
y_pred = model.predict(X_test)

# Evaluasi performa model
# accuracy = accuracy_score(y_test, y_pred)
print("Hasil Cofusion Matrix RF dan NB:")
print(classification_report(y_test, y_pred))

Hasil Cofusion Matrix RF dan NB:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        43
           1       0.96      1.00      0.98        71

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

