In [4]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np
import pickle
%matplotlib inline

# Seed NumPy's global (legacy) random number generator so that stochastic steps
# drawing from it are repeatable on a fresh "Restart Kernel -> Run All".
# The RandomForestClassifier created later in this notebook is built without a
# random_state; per the scikit-learn docs, such estimators fall back to this
# global NumPy RNG, so this seed is what makes the forest results reproducible.
np.random.seed(1)

In [5]:
# Load data
def _load_pickle(path):
    """Unpickle and return the object stored in the file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read, instead of being left open until garbage
    collection.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load pickles that come from a trusted source.
    with open(path, "rb") as f:
        return pickle.load(f)


x_train = _load_pickle("x_train.p")
y_train = _load_pickle("y_train.p")
x_test = _load_pickle("x_test.p")
labels = _load_pickle("word_labels.p")

print(labels)

['thi' 'book' 'wa' 'one' 'read' 'movi' 'like' 'great' 'good' 'veri' 'time'
 'get' 'would' 'ha' 'love' 'hi' 'use' 'onli' 'work' 'dont' 'make' 'realli'
 'stori' 'first' 'much' 'even' 'buy' 'well' 'becaus' 'look' 'best' 'want'
 'think' 'go' 'year' 'cd' 'film' 'better' 'album' 'ani' 'way' 'product'
 'song' 'see' 'could' 'know' 'also' 'thing' 'charact' 'music' 'mani' 'tri'
 'say' 'littl' 'im' 'review' 'ever' 'new' 'recommend' 'enjoy' 'never'
 'peopl' 'bad' 'watch' 'doe' 'back' 'play' 'bought' 'give' 'still' 'find'
 'dvd' 'need' 'made' 'got' 'end' 'disappoint' 'money' 'didnt' 'old' 'two'
 'take' 'come' 'interest' 'put' 'seem' 'day' 'ive' 'life' 'thought'
 'sound' 'lot' 'cant' 'everi' 'purchas' 'star' 'found' 'feel' 'wonder'
 'fan' 'befor' 'anoth' 'qualiti' 'start' 'write' 'someth' 'version'
 'author' 'worth' 'show' 'wast' 'whi' 'doesnt' 'long' 'part' 'set' 'last'
 'differ' 'bore' 'help' 'must' 'game' 'classic' 'noth' 'novel' 'keep'
 'listen' 'order' 'problem' 'excel' 'anyon' 'expect' 'actual

In [7]:
def classification_err(y, real_y):
    """
    Return the classification error between two equally-sized vectors of
    labels, i.e. the fraction of samples for which the labels differ.

    Inputs:
        y: (N, ) shaped array of predicted labels
        real_y: (N, ) shaped array of true labels
    Output:
        Scalar classification error as a Python float in [0, 1]
    Raises:
        ValueError: if the two inputs have different shapes or are empty
    """
    y = np.asarray(y)
    real_y = np.asarray(real_y)

    if y.shape != real_y.shape:
        raise ValueError(
            "y and real_y must have the same shape, got %s and %s"
            % (y.shape, real_y.shape)
        )
    if y.size == 0:
        raise ValueError("cannot compute a classification error on empty input")

    # Fraction of mismatches == 1 - (number of matches / N).
    # The previous expression computed (1 - number of matches) / N, which is
    # wrong (and usually negative) because of operator precedence.
    return float(np.mean(y != real_y))

def eval_tree_based_model_max_depth(clf, max_depth, X_train, y_train):
    """
    Refit the given classifier (either a decision tree or random forest) once
    for every maximum tree depth in the vector max_depth and record the
    classification error it achieves on the training data.

    Only the training error is computed and returned: no test data is passed
    to this function.

    Inputs:
        clf: either a decision tree or random forest classifier object. It is
             modified in place: its max_depth parameter is overwritten on each
             iteration and it is left fitted with the last value in max_depth.
        max_depth: a (T, ) vector of all the max_depth stopping condition parameters
                            to test, where T is the number of parameters to test
        X_train: (N, D) matrix of training samples.
        y_train: (N, ) vector of training labels.
    Output:
        train_err: (T, ) vector of classification errors on the training data
    """
    train_err = np.zeros(len(max_depth))

    for i, depth in enumerate(max_depth):
        # set_params is the estimator API for changing a hyperparameter; unlike
        # plain attribute assignment it rejects names that are not parameters.
        clf.set_params(max_depth=depth)
        clf.fit(X_train, y_train)

        train_predictions = clf.predict(X_train)
        train_err[i] = classification_err(train_predictions, y_train)

    return train_err


In [None]:
# Number of trees in the forest.
n_estimators = 1000

clf = RandomForestClassifier(n_estimators=n_estimators, criterion='gini')

# Candidate depth limits: 2, 3, ..., 20.
max_depth = np.arange(2, 21)

print(max_depth)

# Fits the forest once per candidate depth, so this cell is slow.
train_err = eval_tree_based_model_max_depth(clf, max_depth, x_train, y_train)
print(train_err)

# Only the training error is plotted and reported. The evaluation function
# returns training error only, and the notebook loads x_test without matching
# test labels, so there is no test error to show. The earlier references to an
# undefined `test_err` raised NameError.
plt.figure()
plt.plot(max_depth, train_err, label='Training error')
plt.xlabel('Maximum Depth')
plt.ylabel('Classification error')
plt.title('Random Forest with Gini Impurity and Maximum Depth')
plt.legend(loc=0, shadow=True, fontsize='x-large')
plt.show()

print('Training error minimized at max_depth = %i' % max_depth[np.argmin(train_err)])