In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import datasets
from sklearn import preprocessing
import pandas as pd

### Iris

In [2]:
dataset = datasets.load_iris()

features = dataset.data
targets = dataset.target

feature_train, feature_test, target_train, target_test = train_test_split(features, targets, test_size=.3)

In [3]:
model1 = RandomForestClassifier(n_estimators=1000, max_features='sqrt')
model1.fit(feature_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [4]:
predictions = model1.predict(feature_test)

print(confusion_matrix(target_test, predictions))
print(accuracy_score(target_test, predictions))

[[14  0  0]
 [ 0 14  0]
 [ 0  3 14]]
0.933333333333


### credit - default

In [5]:
credit_data = pd.read_csv("data/credit_data.csv")

features = credit_data[["income","age","loan","LTI"]]
targets = credit_data.default

# features = preprocessing.MinMaxScaler().fit_transform(features)

feature_train, feature_test, target_train, target_test = train_test_split(features, targets, test_size=.2)

In [6]:
model2 = RandomForestClassifier(n_estimators=1000, max_features='sqrt')
model2.fit(feature_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [7]:
predictions = model2.predict(feature_test)

print(confusion_matrix(target_test, predictions))
print(accuracy_score(target_test, predictions))

[[336   0]
 [  0  64]]
1.0


### Digit _ Grid Search

In [8]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict

In [9]:
dataset = datasets.load_digits()

image_features = dataset.images.reshape((len(dataset.images), -1))
image_targets = dataset.target

In [10]:
feature_train, feature_test, target_train, target_test = train_test_split(image_features, image_targets, test_size=0.2)

In [11]:
random_forest_model = RandomForestClassifier(n_jobs=-1, max_features='sqrt')

In [12]:
param_grid = {
    "n_estimators" : [10,100,500,1000],
    "max_depth" : [1,5,10,15],
    "min_samples_leaf" : [1,2,3,4,5,10,15,20,30,40,50]              
}

# elapsed : 25 min
grid_search = GridSearchCV(estimator=random_forest_model, param_grid=param_grid, cv=10, verbose=1)
grid_search.fit(feature_train, target_train)
print(grid_search.best_params_)

Fitting 10 folds for each of 176 candidates, totalling 1760 fits


[Parallel(n_jobs=1)]: Done 1760 out of 1760 | elapsed: 24.9min finished


{'max_depth': 15, 'min_samples_leaf': 1, 'n_estimators': 500}


In [13]:
optimal_estimators = grid_search.best_params_.get("n_estimators")
optimal_depth = grid_search.best_params_.get("max_depth")
optimal_leaf = grid_search.best_params_.get("min_samples_leaf")

In [14]:
best_model = RandomForestClassifier(n_estimators=optimal_estimators, 
                                    max_depth=optimal_depth, 
                                    max_features='sqrt', 
                                    min_samples_leaf = optimal_leaf)

In [15]:
k_fold = KFold(n_splits=10, random_state=123)

In [16]:
predictions = cross_val_predict(best_model, feature_test, target_test, cv=k_fold)
print("Accuracy of the tuned model: ", accuracy_score(target_test, predictions))

Accuracy of the tuned model:  0.944444444444
