# Random Forest Tutorial

Загрузим датасет http://archive.ics.uci.edu/ml/datasets/BlogFeedback

## Считывание и подготовка данных

In [6]:
import pandas
train = pandas.read_csv('blogData_train.csv', header=None)
print 'Data size:', train.shape
train.ix[:, :16].head()

Data size: (52397, 281)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,40.30467,53.845657,0,401,15,15.52416,32.44188,0,377,3,14.044226,32.615417,0,377,2,34.567566,48.475178
1,40.30467,53.845657,0,401,15,15.52416,32.44188,0,377,3,14.044226,32.615417,0,377,2,34.567566,48.475178
2,40.30467,53.845657,0,401,15,15.52416,32.44188,0,377,3,14.044226,32.615417,0,377,2,34.567566,48.475178
3,40.30467,53.845657,0,401,15,15.52416,32.44188,0,377,3,14.044226,32.615417,0,377,2,34.567566,48.475178
4,40.30467,53.845657,0,401,15,15.52416,32.44188,0,377,3,14.044226,32.615417,0,377,2,34.567566,48.475178


In [9]:
y = train.ix[:, train.shape[1]-1].values
X = train.ix[:, 0:train.shape[1]-2].values
print X.shape
print y.shape

(52397, 280)
(52397,)


In [10]:
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.9, random_state=1543)

## Сходимость моделей

In [11]:
from sklearn.metrics import mean_squared_error as mse

def get_mse_list(X, y, clf):
    predictions = list()
    mses = list()
    for i, tree in enumerate(clf.estimators_):
        predictions.append(tree.predict(X))
        rf_prediction = mean(predictions, axis=0)
        mses.append(mse(y, rf_prediction))
    print 'Last score', mses[-1]
    return mses

In [12]:
def plot_mse_graph(clf, X_train, X_test, y_train, y_test, prefix, print_train=True):
    clf.fit(X_train, y_train)
    train_mses = get_mse_list(X_train, y_train, clf)
    test_mses = get_mse_list(X_test, y_test, clf)
    if print_train:
        plot(train_mses, label=prefix + ' train set error')
    plot(test_mses, label=prefix + ' test set error')
    xlabel('n_estimators')
    ylabel('MSE')
    legend()

In [14]:
%%time
n_estimators = 50
from sklearn.ensemble import RandomForestRegressor
plot_mse_graph(RandomForestRegressor(n_estimators=n_estimators, n_jobs=3),
               X_train, X_test, y_train, y_test,
               prefix='RF')
plot_mse_graph(ExtraTreesRegressor(n_estimators=n_estimators, n_jobs=3, bootstrap=True),
               X_train, X_test, y_train, y_test,
               prefix='ET')

NameError: global name 'mean' is not defined

## Важности признаков

In [None]:
importances = RandomForestRegressor(n_estimators=n_estimators, n_jobs=3).fit(X_train, y_train).feature_importances_
plot(sorted(importances))
print sum(importances)
xlabel('feature number')
ylabel('feature importance')

## Out-of-bag score

In [None]:
predicted = RandomForestRegressor(n_estimators=n_estimators, n_jobs=3, oob_score=True).fit(X_train, y_train).oob_prediction_
print mse(y_train, predicted)

## Choosing features subset size

In [None]:
for i in range(1, 10, 2):
    plot_mse_graph(RandomForestRegressor(n_estimators=n_estimators, n_jobs=3, max_features=i),
        X_train, X_test, y_train, y_test,
        prefix='i=' + str(i),
        print_train=False)
legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)