In [3]:
import pandas
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20. Prefer `sklearn.model_selection`; fall back to the legacy module
# only so that very old installations keep working.
try:
    from sklearn.model_selection import KFold, cross_val_score
except ImportError:
    from sklearn.cross_validation import KFold, cross_val_score
    legacy_cv_api = True
else:
    legacy_cv_api = False

# get data
data = pandas.read_csv('abalone.csv')

# convert Sex to int: 'M' -> 1, 'F' -> -1, any other value -> 0
data['Sex'] = data['Sex'].map(lambda x: 1 if x == 'M' else (-1 if x == 'F' else 0))

# prepare features (predictor columns) and the target ('Rings')
X = np.array(data[['Sex', 'Length', 'Diameter', 'Height', 'WholeWeight', 'ShuckedWeight', 'VisceraWeight', 'ShellWeight']])
y = np.array(data['Rings'])

# 5-fold shuffled splitter with a fixed seed, shared by every forest size below
if legacy_cv_api:
    # legacy signature: sample count first, fold count passed as `n_folds`
    kf = KFold(len(X), n_folds=5, random_state=1, shuffle=True)
else:
    # current signature: fold count passed as `n_splits`, no sample count
    kf = KFold(n_splits=5, random_state=1, shuffle=True)

# report the mean cross-validated R^2 for forests of 1..50 trees
for i in range(1, 51):
    estimator = RandomForestRegressor(random_state=1, n_estimators=i)
    print('Count of estimators: ' + str(i))
    print('Cross val: ', end = '\t')
    print(cross_val_score(estimator, X, y, cv=kf, scoring = 'r2', n_jobs=-1).mean())
    print()
    
# We can see that with more than 21 estimators the cross-validation score exceeds 0.52. Answer: 22
# We can also see that more estimators give a higher score, but the score grows slowly

Count of estimators: 1
Cross val: 	0.102138694877

Count of estimators: 2
Cross val: 	0.338416755158

Count of estimators: 3
Cross val: 	0.403579849462

Count of estimators: 4
Cross val: 	0.442722398967

Count of estimators: 5
Cross val: 	0.464020766067

Count of estimators: 6
Cross val: 	0.470581632759

Count of estimators: 7
Cross val: 	0.475830616352

Count of estimators: 8
Cross val: 	0.481741845626

Count of estimators: 9
Cross val: 	0.488347813022

Count of estimators: 10
Cross val: 	0.494464124802

Count of estimators: 11
Cross val: 	0.493396555

Count of estimators: 12
Cross val: 	0.497965876398

Count of estimators: 13
Cross val: 	0.502136460572

Count of estimators: 14
Cross val: 	0.506428696226

Count of estimators: 15
Cross val: 	0.508331197043

Count of estimators: 16
Cross val: 	0.510513143832

Count of estimators: 17
Cross val: 	0.513848294799

Count of estimators: 18
Cross val: 	0.516327541274

Count of estimators: 19
Cross val: 	0.519034688136

Count of estimators: 20
