In [130]:
import os
import mushrooms_data
import cars_data
import sklearn
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.ensemble import RandomForestClassifier
from random import shuffle
from sklearn import tree
import pydotplus

# Loading and formatting datasets
Random Forest needs the data (features) and the targets (labels) separately

In [2]:
def get_values_to_number(data):
    """Encode every column of ``data`` as integer codes.

    Within each column, distinct values are numbered 0, 1, 2, ... in order of
    first appearance, and the resulting per-column mapping is applied with
    ``DataFrame.replace``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose cell values are hashable (they are used as dict keys).

    Returns
    -------
    pandas.DataFrame
        A new frame in which each value is replaced by its per-column code.
        ``data`` itself is not modified.
    """
    dictionary = {}
    for column in data.columns:
        codes = {}
        for value in data[column]:
            # The mapping itself records which values were already seen, so
            # membership is a dict lookup instead of a scan over a side list.
            if value not in codes:
                codes[value] = len(codes)
        dictionary[column] = codes
    return data.replace(dictionary)

In [64]:
def get_data(dataset, random_state=None):
    """Load a dataset, integer-encode it, shuffle it and split features from labels.

    Parameters
    ----------
    dataset : object
        Anything exposing ``get_data()`` whose result is a DataFrame-like
        object with a ``'label'`` column (the notebook passes the
        ``mushrooms_data`` and ``cars_data`` modules).
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous behaviour (a different row order on every call); pass an int
        to make the shuffle reproducible.

    Returns
    -------
    complete_data : pandas.DataFrame
        The encoded, shuffled frame, still including the ``'label'`` column.
    data : list of list
        Feature rows, i.e. every column except ``'label'``.
    targets : list
        The ``'label'`` values, in the same row order as ``data``.
    """
    encoded = get_values_to_number(dataset.get_data())
    # frac=1 returns all rows in random order, i.e. a shuffle.
    complete_data = encoded.sample(frac=1, random_state=random_state)
    data = complete_data.drop(['label'], axis=1).values.tolist()
    targets = complete_data['label'].values.tolist()
    return complete_data, data, targets

In [4]:
#print(dataset)

# Creating train and test sets

In [4]:
def create_train_and_test_sets(data, targets, test_size=0.2, random_state=1, stratify=None):
    """Split features and labels into train and test subsets.

    Parameters
    ----------
    data : sequence
        Feature rows.
    targets : sequence
        Labels aligned with ``data``.
    test_size : float or int, optional
        Forwarded to ``train_test_split`` (default 0.2).
    random_state : int or None, optional
        Seed for the split. Defaults to 1, the value that was previously
        hard-coded, so existing calls produce the same split as before.
    stratify : array-like or None, optional
        Forwarded to ``train_test_split``. Pass ``targets`` to keep the class
        proportions equal in both subsets (useful for imbalanced labels).
        The default ``None`` keeps the previous, non-stratified behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data,
        targets,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return X_train, X_test, y_train, y_test

# Training and scoring with Random Forest and cross-validation

In random forests, each tree in the ensemble is built from a sample drawn with replacement (i.e., a bootstrap sample) from the training set. In addition, when splitting a node during the construction of the tree, the split that is chosen is no longer the best split among all features. Instead, the split that is picked is the best split among a random subset of the features. As a result of this randomness, the bias of the forest usually slightly increases (with respect to the bias of a single non-random tree) but, due to averaging, its variance also decreases, usually more than compensating for the increase in bias, hence yielding an overall better model.

## Parameters to test
--> Random Forest

- n_estimators - integer, optional (default=10): number of trees in the forest.

- criterion - string, optional (default=”gini”): The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.

- max_features - int, float, string or None, optional (default=”auto”): number of features to consider when looking for the best split:

        If int, then consider max_features features at each split.
        If float, then max_features is a percentage and int(max_features * n_features) features are considered at each split.
        If “auto”, then max_features=sqrt(n_features).
        If “sqrt”, then max_features=sqrt(n_features) (same as “auto”).
        If “log2”, then max_features=log2(n_features).
        If None, then max_features=n_features.

    Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.
    
--> Cross-validation

- cv : int, cross-validation generator or an iterable, optional

    Determines the cross-validation splitting strategy. Possible inputs for cv are:

            None, to use the default 3-fold cross validation,
            integer, to specify the number of folds in a (Stratified)KFold,
            An object to be used as a cross-validation generator.
            An iterable yielding train, test splits.
            
- scoring: ‘accuracy’, ‘average_precision’, ‘f1’, ‘precision’, ‘recall’


### Mushrooms dataset 

In [114]:
# get_data returns (shuffled integer-encoded DataFrame, feature rows, labels);
# `dataset` is therefore a DataFrame here, not the mushrooms_data module.
dataset, data, targets = get_data(mushrooms_data)
# Split with the helper's default test_size.
X_train, X_test, y_train, y_test = create_train_and_test_sets(data, targets)

#### Checking data balancing

In [115]:
# Class counts over the whole dataset (labels are the integer codes from get_values_to_number).
pd.Series(targets).value_counts()

1    4208
0    3916
dtype: int64

In [116]:
# Class counts in the training split.
pd.Series(y_train).value_counts()

1    3359
0    3140
dtype: int64

In [117]:
# Class counts in the held-out test split.
pd.Series(y_test).value_counts()

1    849
0    776
dtype: int64

In [119]:
# Base forest with default settings; GridSearchCV below wraps it.
clf = RandomForestClassifier()
# Tuning log kept for reference. Each row is an 'n_estimators' grid that was tried;
# the value after '-->' is presumably the one GridSearchCV selected for that grid.
# parameters = {'n_estimators': [5, 10, 20, 40, 100], 'criterion': ['gini','entropy']}
#                               [30,40,50]              --> 40 again
#                               [35,40,45]              --> 45
#                               [42,45,47]              --> 45
#                               [100,200,300]           --> 200
#                               [150,200,500]           --> 500
#                               [5,50,100,300,500,1000] --> 50
#                               [40,50,200,500,1000]    --> 200
# Estimator repr apparently pasted from an earlier run (n_estimators=40):
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=None, max_features='auto', max_leaf_nodes=None,
#             min_impurity_split=1e-07, min_samples_leaf=1,
#             min_samples_split=2, min_weight_fraction_leaf=0.0,
#             n_estimators=40, n_jobs=1, oob_score=False, random_state=None,
#             verbose=0, warm_start=False)
# Single-value grid: the search has only n_estimators=45 to evaluate.
parameters = {'n_estimators': [45]}
# From here on `clf` is the GridSearchCV wrapper, not the bare forest.
clf = GridSearchCV(clf, parameters)
clf.fit(X_train, y_train)
# Shown as the cell output: the estimator chosen by the search.
clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=45, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [120]:
# Predict on the held-out test split and report the fraction of correct predictions.
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [121]:
# Confusion matrix on the test split (scikit-learn convention: rows are true classes, columns are predicted classes).
confusion_matrix(y_test, y_pred)

array([[776,   0],
       [  0, 849]])

In [122]:
# 10-fold cross-validation over the full dataset (train and test rows together).
# `clf` is the GridSearchCV wrapper at this point, so that is the estimator being scored.
scores = cross_val_score(clf, data, targets, cv=10)
scores

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

### Cars dataset

In [123]:
# Rebinds dataset/data/targets (and the split below) to the cars data, replacing the mushrooms values.
# `dataset` is the shuffled integer-encoded DataFrame returned by get_data, not the cars_data module.
dataset, data, targets = get_data(cars_data)
# Split with the helper's default test_size.
X_train, X_test, y_train, y_test = create_train_and_test_sets(data, targets)

#### Checking data balancing

In [124]:
# Class counts over the whole cars dataset (labels are the integer codes from get_values_to_number).
pd.Series(targets).value_counts()

0    1210
1     384
3      69
2      65
dtype: int64

In [125]:
# Forest with 650 trees; 650 matches the last row of the tuning log below.
clf = RandomForestClassifier(n_estimators=650)
# Tuning log kept for reference. Each row is an 'n_estimators' grid that was tried;
# the value after '-->' is presumably the one GridSearchCV selected for that grid.
# parameters = {'n_estimators': [10,50,100,200,500,1000], 'criterion': ['gini','entropy']}
#                               [75,100,150]        --> 75
#                               [60,70,80]          --> 60
#                               [50,60,70]          --> 60
#                               [60,150,300,700,1000, 2000] --> 700
#                               [600,700,800]          --> 600
#                               [500,600,700]          --> 700
#                               [600,650,700]          --> 650
# Estimator repr apparently pasted from an earlier run (note n_estimators=40, unlike the 650 used here):
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=None, max_features='auto', max_leaf_nodes=None,
#             min_impurity_split=1e-07, min_samples_leaf=1,
#             min_samples_split=2, min_weight_fraction_leaf=0.0,
#             n_estimators=40, n_jobs=1, oob_score=False, random_state=None,
#             verbose=0, warm_start=False)
# The grid search is disabled here; the forest is fitted directly on the training split.
#parameters = {'n_estimators': [650]}
#clf = GridSearchCV(clf, parameters)
clf = clf.fit(X_train, y_train)
#clf.best_estimator_
#clf.best_estimator_

In [126]:
# Predict on the held-out test split and report the fraction of correct predictions.
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.97687861271676302

In [127]:
# Confusion matrix on the test split (scikit-learn convention: rows are true classes, columns are predicted classes).
confusion_matrix(y_test, y_pred)

array([[243,   3,   0,   0],
       [  1,  71,   0,   0],
       [  0,   4,   8,   0],
       [  0,   0,   0,  16]])

In [128]:
# 10-fold cross-validation of the 650-tree forest over the full dataset (train and test rows together).
scores = cross_val_score(clf, data, targets, cv=10)
scores

array([ 0.99425287,  0.98850575,  0.98850575,  0.99425287,  0.99421965,
        0.96511628,  0.99418605,  0.99418605,  0.97674419,  0.97076023])

In [107]:
# Export only the first tree of the fitted forest to Graphviz and render it as a PNG.
first_tree = clf.estimators_[0]
# `dataset` is the full encoded frame returned by get_data and still contains the
# 'label' column; the forest was trained without it, so drop it to keep the names
# aligned one-to-one with the feature indices used by the tree.
feature_names = dataset.drop(['label'], axis=1).columns.tolist()
tree.export_graphviz(first_tree, feature_names=feature_names, filled=True, rounded=True, out_file='tree.dot')
# Rendering needs the Graphviz `dot` executable on PATH; report a failure instead of ignoring it.
exit_status = os.system('dot -Tpng tree.dot -o tree.png')
if exit_status != 0:
    print('dot exited with status {}; tree.png may be missing or stale'.format(exit_status))