In [10]:
# Fitting and predicting: estimator basics

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Create the (not yet fitted) random forest classifier; random_state=0 pins
# the seed so repeated runs build the same forest.
clf = RandomForestClassifier(random_state=0)

In [3]:
# Toy training set: one class label per sample, two samples of three features.
y = [0, 1]
x = [
    [1, 2, 3],
    [10, 11, 12],
]
# Fit the classifier; the fitted estimator is the cell's displayed value.
clf.fit(x, y)

RandomForestClassifier(random_state=0)

In [4]:
# Predict the classes of the same samples the classifier was fitted on.
clf.predict(x)

array([0, 1])

In [5]:
# Predict the classes of two new samples that were not part of the fit data.
clf.predict([[4,5,6],[14,15,16]])

array([0, 1])

In [6]:
# Predict for three new samples, including one with negative feature values.
clf.predict([[4,5,6],[14,15,16],[-1,-2,-3]])

array([0, 1, 0])

In [7]:
# Transformers and pre-processors

from sklearn.preprocessing import StandardScaler
# Small matrix with two samples and two features on different scales.
X = [
    [0, 15],
    [1, -10],
]
scaler = StandardScaler()
# Learn the scaling values first (a non-final expression is not displayed) ...
scaler.fit(X)
# ... then scale the same data; this result is the cell's output.
scaler.transform(X)

array([[-1.,  1.],
       [ 1., -1.]])

In [11]:
# Pipelines: chaining pre-processors and estimators

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [12]:
# Chain a StandardScaler and a LogisticRegression into a single pipeline
# object, so scaling and classification are fitted and applied together.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

In [14]:
# Load the iris data as a (features, target) pair.
# Note: in mathematical notation a matrix variable is conventionally written
# with a capital letter (e.g. X).
x, y = load_iris(return_X_y=True)

# Split into train and test subsets; random_state=0 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
)

In [15]:
# Fit every step of the pipeline on the training split; the fitted pipeline
# is the cell's displayed value.
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [16]:
# accuracy_score's signature is (y_true, y_pred): ground-truth labels first,
# predictions second. Accuracy itself is symmetric, so the value is unchanged,
# but the documented order keeps this cell a safe template for metrics that
# are not symmetric.
accuracy_score(y_test, pipe.predict(X_test))

0.9736842105263158

In [17]:
# Model evaluation
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [18]:
# Generate a synthetic regression problem with 100 samples; the fixed seed
# makes the generated data repeatable.
X, y = make_regression(
    n_samples=100,
    random_state=0,
)
# Plain linear regression model to be evaluated.
lr = LinearRegression()

In [22]:
# 10-fold cross-validation of the linear model; training scores are requested
# in addition to the test scores.
result = cross_validate(
    lr,
    X,
    y,
    cv=10,
    return_train_score=True,
)
# Display the score obtained on each of the 10 test folds.
result['test_score']

array([0.93351425, 0.78418281, 0.91394781, 0.68917166, 0.90738457,
       0.98073245, 0.64911078, 0.93550583, 0.77686059, 0.8761918 ])

In [23]:
# Automatic parameter search

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

In [25]:
# Fetch the California housing dataset as a (features, target) pair.
X, y = fetch_california_housing(return_X_y=True)

# Hold out a test subset; random_state=0 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [26]:
# Search space for the randomized search. Each value is a frozen scipy
# distribution; randint(low, high) draws integers from [low, high).
param_distributions = {
    'n_estimators': randint(1, 5),
    'max_depth': randint(5, 10),
}

In [27]:
# Randomized hyper-parameter search over a random forest regressor:
# 5 candidate settings are sampled from param_distributions, and both the
# forest and the sampling are seeded so the search is repeatable.
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_distributions=param_distributions,
    n_iter=5,
    random_state=0,
)

In [28]:
# Run the randomized search on the training split; the fitted search object
# is the cell's displayed value.
search.fit(X_train, y_train)

RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x105c3b4f0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1398d62b0>},
                   random_state=0)

In [29]:
# Parameter setting that gave the best cross-validated result in the search.
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [30]:
# Evaluate the fitted search object on the held-out test split.
# NOTE(review): with the default refit/scoring settings this should be the
# best estimator's own score (R^2 for a regressor) — confirm if settings change.
search.score(X_test, y_test)

0.735363411343253