<a href="https://colab.research.google.com/github/yunisroni/ilmudataproject/blob/main/FS_DS1_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
# Source data: Statlog (German Credit Data), UCI ML Repository id 144
# https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data

from ucimlrepo import fetch_ucirepo

# Download the dataset bundle from the UCI repository.
statlog_german_credit_data = fetch_ucirepo(id=144)

# Unpack the feature table and the target table (pandas dataframes).
X, y = (
    statlog_german_credit_data.data.features,
    statlog_german_credit_data.data.targets,
)

# Print the dataset-level metadata first, then the per-variable information.
for section in (
    statlog_german_credit_data.metadata,
    statlog_german_credit_data.variables,
):
    print(section)


{'uci_id': 144, 'name': 'Statlog (German Credit Data)', 'repository_url': 'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data', 'data_url': 'https://archive.ics.uci.edu/static/public/144/data.csv', 'abstract': 'This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1000, 'num_features': 20, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Other', 'Marital Status', 'Age', 'Occupation'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5NC77', 'creators': ['Hans Hofmann'], 'intro_paper': None, 'additional_info': {'summary': 'Two datasets are provided.  the original dataset, in the form provided by

In [None]:
import pandas as pd

# Place the feature columns and the target column side by side in a single
# DataFrame, then print its column / dtype summary.
df = pd.concat((X, y), axis="columns")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Attribute1   1000 non-null   object
 1   Attribute2   1000 non-null   int64 
 2   Attribute3   1000 non-null   object
 3   Attribute4   1000 non-null   object
 4   Attribute5   1000 non-null   int64 
 5   Attribute6   1000 non-null   object
 6   Attribute7   1000 non-null   object
 7   Attribute8   1000 non-null   int64 
 8   Attribute9   1000 non-null   object
 9   Attribute10  1000 non-null   object
 10  Attribute11  1000 non-null   int64 
 11  Attribute12  1000 non-null   object
 12  Attribute13  1000 non-null   int64 
 13  Attribute14  1000 non-null   object
 14  Attribute15  1000 non-null   object
 15  Attribute16  1000 non-null   int64 
 16  Attribute17  1000 non-null   object
 17  Attribute18  1000 non-null   int64 
 18  Attribute19  1000 non-null   object
 19  Attribute20  1000 non-null  

In [None]:
# Collect the names of the categorical (object-dtype) columns.
categorical_cols = df.select_dtypes(include=['object']).columns

# For each categorical column, print its name, the frequency of every
# category, and a separator line.
separator = "-" * 20
for col in categorical_cols:
    print(f"Variable: {col}", df[col].value_counts(), separator, sep="\n")

Variable: Attribute1
Attribute1
A14    394
A11    274
A12    269
A13     63
Name: count, dtype: int64
--------------------
Variable: Attribute3
Attribute3
A32    530
A34    293
A33     88
A31     49
A30     40
Name: count, dtype: int64
--------------------
Variable: Attribute4
Attribute4
A43     280
A40     234
A42     181
A41     103
A49      97
A46      50
A45      22
A44      12
A410     12
A48       9
Name: count, dtype: int64
--------------------
Variable: Attribute6
Attribute6
A61    603
A65    183
A62    103
A63     63
A64     48
Name: count, dtype: int64
--------------------
Variable: Attribute7
Attribute7
A73    339
A75    253
A74    174
A72    172
A71     62
Name: count, dtype: int64
--------------------
Variable: Attribute9
Attribute9
A93    548
A92    310
A94     92
A91     50
Name: count, dtype: int64
--------------------
Variable: Attribute10
Attribute10
A101    907
A103     52
A102     41
Name: count, dtype: int64
--------------------
Variable: Attribute12
Attribute12
A1

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert every categorical column of df to integer codes with label encoding.
# The fitted encoder of each converted column is kept in label_encoders,
# keyed by column name.
label_encoders = {}
for column in df.columns:
    # Only object-dtype columns are encoded; numeric columns are left as-is.
    if df[column].dtype != 'object':
        continue
    le = LabelEncoder().fit(df[column])
    df[column] = le.transform(df[column])
    label_encoders[column] = le

# Preview the encoded DataFrame.
df.head()

Unnamed: 0,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,...,Attribute12,Attribute13,Attribute14,Attribute15,Attribute16,Attribute17,Attribute18,Attribute19,Attribute20,class
0,0,6,4,4,1169,4,4,4,2,0,...,0,67,2,1,2,2,1,1,0,1
1,1,48,2,4,5951,0,2,2,1,0,...,0,22,2,1,1,2,1,0,0,2
2,3,12,4,7,2096,0,3,2,2,0,...,0,49,2,1,1,1,2,0,0,1
3,0,42,2,3,7882,0,3,2,2,2,...,1,45,2,2,1,2,2,0,0,1
4,0,24,3,0,4870,0,2,3,2,0,...,3,53,2,2,2,2,2,0,0,2


In [None]:
# Coefficient of variation (CV) for every column of df.
# NOTE(review): at this point df also holds the label-encoded categorical
# columns and the target column, so their CV values are computed on integer
# codes - confirm that is intended.

import numpy as np

# Per-column mean and standard deviation.
mean = df.mean()
std_dev = df.std()

# CV as a percentage: (standard deviation / mean) * 100.
coefficient_of_variation = std_dev.div(mean).mul(100)

coefficient_of_variation

Unnamed: 0,0
Attribute1,79.748746
Attribute2,57.689396
Attribute3,42.558728
Attribute4,83.591757
Attribute5,86.289032
Attribute6,142.988472
Attribute7,50.683987
Attribute8,37.629152
Attribute9,42.097507
Attribute10,329.452544


In [None]:
# Feature selection on df with the chi-squared statistic.
# NOTE(review): chi2 is applied here to label-encoded codes and raw integer
# columns alike, and the fixed score threshold below is compared across
# columns of very different scale - confirm this is intended.

from sklearn.feature_selection import chi2

# The target (y) is the 'class' column; the features (X) are all other columns.
y = df['class']
X = df.drop(columns='class')

# One chi-squared score and one p-value per feature.
chi2_scores, p_values = chi2(X, y)

# Tabulate the results, sorted by chi-squared score in descending order.
results = pd.DataFrame(
    {'Feature': X.columns, 'Chi2 Score': chi2_scores, 'P-value': p_values}
).sort_values('Chi2 Score', ascending=False)

print(results)

# Keep the features whose chi-squared score is above the threshold.
threshold = 10  # cut-off value
selected_features_chi = results.loc[results['Chi2 Score'] > threshold, 'Feature'].tolist()

print(f"\nFitur yang terpilih menggunakan chi-squared: {selected_features_chi}")

        Feature    Chi2 Score       P-value
4    Attribute5  58262.490007  0.000000e+00
1    Attribute2    321.030795  8.637197e-72
0    Attribute1    123.333665  1.178641e-28
5    Attribute6     72.269985  1.876804e-17
12  Attribute13     30.200309  3.896473e-08
2    Attribute3     24.103752  9.128229e-07
11  Attribute12     16.501718  4.860601e-05
6    Attribute7      8.232742  4.114132e-03
19  Attribute20      6.487773  1.086189e-02
13  Attribute14      3.582800  5.838066e-02
8    Attribute9      2.315724  1.280717e-01
7    Attribute8      2.204619  1.375979e-01
9   Attribute10      0.993432  3.189050e-01
18  Attribute19      0.792551  3.733300e-01
15  Attribute16      0.495516  4.814772e-01
16  Attribute17      0.240196  6.240645e-01
14  Attribute15      0.113230  7.364966e-01
3    Attribute4      0.005246  9.422614e-01
10  Attribute11      0.003766  9.510663e-01
17  Attribute18      0.001031  9.743885e-01

Fitur yang terpilih menggunakan chi-squared: ['Attribute5', 'Attribute2', '

In [None]:
# Feature selection on df using Random Forest feature importances.

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# The features (X) are every column except 'class'; the target (y) is 'class'.
X = df.drop('class', axis=1)
y = df['class']

# SelectFromModel.fit() clones the estimator it is given and fits that clone,
# so fitting the forest separately beforehand only trained the same model
# twice. Fit once through the selector instead. threshold='median' keeps the
# features whose importance is at least the median importance.
selector = SelectFromModel(
    estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    threshold='median',
)
selector.fit(X, y)

# Keep the fitted forest available under its original name.
rf_model = selector.estimator_

# NOTE(review): the selector is fitted on all rows of df, including the rows
# that the later train/test splits hold out for testing, so the downstream
# scores may be optimistic - consider fitting it on the training rows only.

# Names of the columns the selector kept.
selected_features_rf = X.columns[selector.get_support()]

print(f"Fitur yang terpilih menggunakan Random Forest: {selected_features_rf.tolist()}")

Fitur yang terpilih menggunakan Random Forest: ['Attribute1', 'Attribute2', 'Attribute3', 'Attribute4', 'Attribute5', 'Attribute6', 'Attribute7', 'Attribute11', 'Attribute12', 'Attribute13']


In [None]:
# Classification with XGBoost on the features selected by Random Forest.

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Restrict the feature matrix to the columns chosen by the RF selector.
selected_features = selected_features_rf
X_selected = X[selected_features]

# Recode the target to 0/1 (class 1 -> 0, any other value -> 1).
# It is derived from df['class'] rather than from y itself: remapping y in
# place flips the labels every time this cell is re-run.
y = np.where(df['class'] == 1, 0, 1)

# Split into training data (80%) and test data (20%) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Train XGBoost on the training data and predict the test data.
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model. Note that the header text says "confusion matrix" while
# the printed table is the output of classification_report.
print("Hasil Cofusion Matrix RF dan XGB:")
print(classification_report(y_test, y_pred))

Hasil Cofusion Matrix RF dan XGB:
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       141
           1       0.70      0.51      0.59        59

    accuracy                           0.79       200
   macro avg       0.76      0.71      0.72       200
weighted avg       0.78      0.79      0.78       200



In [None]:
# Classification with Gaussian Naive Bayes on the features selected by
# Random Forest.

from sklearn.naive_bayes import GaussianNB

# Restrict the feature matrix to the columns chosen by the RF selector.
selected_features = selected_features_rf
X_selected = X[selected_features]

# Build the 0/1 target (class 1 -> 0, any other value -> 1) directly from
# df['class'], so this cell does not depend on whatever state y was left in
# by previously executed cells.
y = np.where(df['class'] == 1, 0, 1)

# Split into training data (80%) and test data (20%) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Train Naive Bayes on the training data and predict the test data.
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model. Note that the header text says "confusion matrix" while
# the printed table is the output of classification_report.
print("Hasil Cofusion Matrix RF dan NB:")
print(classification_report(y_test, y_pred))

Hasil Cofusion Matrix RF dan NB:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       141
           1       0.62      0.41      0.49        59

    accuracy                           0.75       200
   macro avg       0.70      0.65      0.66       200
weighted avg       0.73      0.75      0.73       200

