<a href="https://colab.research.google.com/github/wachicode/LE2_Casapao_S/blob/main/LE2_Casapao_S.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data"
columns = ['parents', 'has_nurs', 'form', 'children', 'housing', 'finance', 'social', 'health', 'class']
data = pd.read_csv(url, names=columns)

data.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


In [38]:
data.isnull().sum()
data.dropna(inplace=True)

data.duplicated().sum()
data.drop_duplicates(inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12960 entries, 0 to 12959
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   parents   12960 non-null  object
 1   has_nurs  12960 non-null  object
 2   form      12960 non-null  object
 3   children  12960 non-null  object
 4   housing   12960 non-null  object
 5   finance   12960 non-null  object
 6   social    12960 non-null  object
 7   health    12960 non-null  object
 8   class     12960 non-null  object
dtypes: object(9)
memory usage: 911.4+ KB


In [39]:
data_encoded = pd.get_dummies(data.drop('class', axis=1))
target = data['class']

X_train, X_test, y_train, y_test = train_test_split(data_encoded, target, test_size=0.3, random_state=42)

In [40]:
# classifier 1
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

In [41]:
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:2f}')

report = classification_report(y_test, y_pred, digits=4)
print(report)

Accuracy: 0.987140
              precision    recall  f1-score   support

   not_recom     1.0000    1.0000    1.0000      1320
    priority     0.9796    0.9827    0.9812      1272
   recommend     0.0000    0.0000    0.0000         2
  spec_prior     0.9817    0.9916    0.9866      1190
  very_recom     0.9778    0.8462    0.9072       104

    accuracy                         0.9871      3888
   macro avg     0.7878    0.7641    0.7750      3888
weighted avg     0.9866    0.9871    0.9867      3888



In [42]:
# classifier 2
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

In [43]:
y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred,)
print(f'Accuracy: {accuracy:2f}')

report = classification_report(y_test, y_pred, digits=4)
print(report)

Accuracy: 0.934414
              precision    recall  f1-score   support

   not_recom     1.0000    1.0000    1.0000      1320
    priority     0.8990    0.9025    0.9007      1272
   recommend     0.0000    0.0000    0.0000         2
  spec_prior     0.9037    0.9227    0.9131      1190
  very_recom     0.8816    0.6442    0.7444       104

    accuracy                         0.9344      3888
   macro avg     0.7369    0.6939    0.7117      3888
weighted avg     0.9338    0.9344    0.9336      3888



In [44]:
# ensemble
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)

ensemble_classifier = VotingClassifier(estimators=[
      ('rf', rf_classifier),
      ('svm', svm_classifier)
], voting='soft')

ensemble_classifier.fit(X_train, y_train)

In [45]:
y_pred = ensemble_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred,)
print(f'Accuracy: {accuracy:2f}')

report = classification_report(y_test, y_pred, digits=4)
print(report)

Accuracy: 0.969136
              precision    recall  f1-score   support

   not_recom     1.0000    1.0000    1.0000      1320
    priority     0.9508    0.9568    0.9538      1272
   recommend     0.0000    0.0000    0.0000         2
  spec_prior     0.9553    0.9697    0.9625      1190
  very_recom     0.9625    0.7404    0.8370       104

    accuracy                         0.9691      3888
   macro avg     0.7737    0.7334    0.7506      3888
weighted avg     0.9687    0.9691    0.9685      3888

