In [7]:
import pandas as pd
from dataset import Dataset

%load_ext autoreload
%autoreload 2

## Create dataset object

In [10]:
# Names of the feature columns handed to Dataset as categorical (22 in total).
categorical_cols = [
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

# The split fractions add up to 1.0: 30% train, 20% validation, 50% test.
# NOTE(review): 'class' is presumably the target column -- confirm against Dataset.
mushroom_dataset = Dataset(
    "../data/mushrooms.csv",
    'class',
    categorical_cols=categorical_cols,
    classification=True,
    train_size=0.3,
    val_size=0.2,
    test_size=0.5,
)

In [11]:
# Show the shapes of the train / validation / test feature and label arrays.
mushroom_dataset.print_shapes()

{'X_train': (2437, 22), 'y_test': (4062, 1), 'y_train': (2437, 1), 'X_test': (4062, 22), 'X_val': (1625, 22), 'y_val': (1625, 1)}


In [12]:
# Inspect the integer-code -> original-label mapping kept for each encoded column.
mushroom_dataset.categorical_mappings

{'bruises': {0: 'f', 1: 't'},
 'cap-color': {0: 'b',
  1: 'c',
  2: 'e',
  3: 'g',
  4: 'n',
  5: 'p',
  6: 'r',
  7: 'u',
  8: 'w',
  9: 'y'},
 'cap-shape': {0: 'b', 1: 'c', 2: 'f', 3: 'k', 4: 's', 5: 'x'},
 'cap-surface': {0: 'f', 1: 'g', 2: 's', 3: 'y'},
 'class': {0: 'e', 1: 'p'},
 'gill-attachment': {0: 'a', 1: 'f'},
 'gill-color': {0: 'b',
  1: 'e',
  2: 'g',
  3: 'h',
  4: 'k',
  5: 'n',
  6: 'o',
  7: 'p',
  8: 'r',
  9: 'u',
  10: 'w',
  11: 'y'},
 'gill-size': {0: 'b', 1: 'n'},
 'gill-spacing': {0: 'c', 1: 'w'},
 'habitat': {0: 'd', 1: 'g', 2: 'l', 3: 'm', 4: 'p', 5: 'u', 6: 'w'},
 'odor': {0: 'a',
  1: 'c',
  2: 'f',
  3: 'l',
  4: 'm',
  5: 'n',
  6: 'p',
  7: 's',
  8: 'y'},
 'population': {0: 'a', 1: 'c', 2: 'n', 3: 's', 4: 'v', 5: 'y'},
 'ring-number': {0: 'n', 1: 'o', 2: 't'},
 'ring-type': {0: 'e', 1: 'f', 2: 'l', 3: 'n', 4: 'p'},
 'spore-print-color': {0: 'b',
  1: 'h',
  2: 'k',
  3: 'n',
  4: 'o',
  5: 'r',
  6: 'u',
  7: 'w',
  8: 'y'},
 'stalk-color-above-ring': {

In [13]:
# Count the rows whose encoded 'stalk-root' value is 0.
# NOTE(review): code 0 is presumably the missing-value marker -- confirm in categorical_mappings.
int((mushroom_dataset.X_pd['stalk-root'] == 0).sum())

2480

A lot of null values for the **'stalk-root'** feature: 2,480 of the 8,124 rows have the encoded value 0.

## DecisionTreeClassifier

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import tree

from utils import evaluate_classifier

In [23]:
# Fit a single Gini-criterion decision tree on the training split.
# random_state is pinned so that Restart & Run All reproduces the same tree:
# scikit-learn randomly permutes the candidate features at every split, so an
# unseeded fit can differ between runs when several splits are equally good.
clf = DecisionTreeClassifier(criterion='gini', random_state=42)
# .ravel() flattens the (n, 1) label column into the 1-D vector fit() expects.
clf = clf.fit(mushroom_dataset.X_train, mushroom_dataset.y_train.ravel())
# Score on the test split first, then on the validation split.
# NOTE(review): evaluate_classifier presumably returns accuracy -- confirm in utils.
print(evaluate_classifier(clf, mushroom_dataset.X_test, mushroom_dataset.y_test))
print(evaluate_classifier(clf, mushroom_dataset.X_val, mushroom_dataset.y_val))

1.0
1.0


## RandomForestClassifier

In [24]:
# Fit a random forest with default hyper-parameters on the training split.
# random_state is pinned because the forest draws bootstrap samples and random
# feature subsets; without a seed every run trains a different ensemble.
clf = RandomForestClassifier(random_state=42)
# .ravel() flattens the (n, 1) label column into the 1-D vector fit() expects.
clf = clf.fit(mushroom_dataset.X_train, mushroom_dataset.y_train.ravel())
# Score on the test split first, then on the validation split.
# NOTE(review): evaluate_classifier presumably returns accuracy -- confirm in utils.
print(evaluate_classifier(clf, mushroom_dataset.X_test, mushroom_dataset.y_test))
print(evaluate_classifier(clf, mushroom_dataset.X_val, mushroom_dataset.y_val))

1.0
1.0




## GradientBoostingClassifier

In [25]:
# Fit a gradient-boosted tree ensemble (learning rate 0.1) on the training split.
# random_state is pinned so the seed handed to each underlying tree is fixed
# and the reported scores can be reproduced on a fresh kernel.
clf = GradientBoostingClassifier(learning_rate=0.1, random_state=42)
# .ravel() flattens the (n, 1) label column into the 1-D vector fit() expects.
clf = clf.fit(mushroom_dataset.X_train, mushroom_dataset.y_train.ravel())
# Score on the test split first, then on the validation split.
# NOTE(review): evaluate_classifier presumably returns accuracy -- confirm in utils.
print(evaluate_classifier(clf, mushroom_dataset.X_test, mushroom_dataset.y_test))
print(evaluate_classifier(clf, mushroom_dataset.X_val, mushroom_dataset.y_val))

0.999015263417036
1.0
