## Scikit-Learn

#### Fitting and predicting: estimator basics

In [1]:
from sklearn.ensemble import RandomForestClassifier
# Toy training set: two samples with three features each, one label per sample.
X = [[1, 2, 3], [11, 12, 13]]
y = [0, 1]

# Seed the forest so repeated runs build the same model, then fit it.
clf = RandomForestClassifier(random_state=0)
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [2]:
# Predict labels for the same samples the classifier was fitted on.
clf.predict(X)

array([0, 1])

In [4]:
# Predict labels for two samples that were not part of the training data.
clf.predict([[4, 5, 6], [14, 15, 16]])

array([0, 1])

#### Transformers and pre-processors

In [5]:
from sklearn.preprocessing import StandardScaler

In [7]:
# Two samples, two feature columns.
X = [[0, 15], [1, -10]]

# Fit the scaler on X and apply it to the same data in a single call.
StandardScaler().fit_transform(X)

array([[-1.,  1.],
       [ 1., -1.]])

#### Pipelines: chaining pre-processors and estimators

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [16]:
# Load the iris data and hold out 30% of the samples for evaluation.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Chain feature scaling with a logistic-regression classifier and fit the
# whole pipeline on the training split only.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=0),
)
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=0,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [17]:
# accuracy_score's signature is (y_true, y_pred): pass the ground-truth labels
# first and the pipeline's predictions second. Accuracy happens to be
# symmetric, so the value is unchanged, but the correct order keeps the cell
# safe if the metric is later swapped for an asymmetric one.
accuracy_score(y_test, pipe.predict(X_test))

0.9777777777777777

#### Model evaluation

In [18]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [20]:
# Synthetic regression problem with 1000 samples and a fixed seed.
X, y = make_regression(n_samples=1000, random_state=0)

# Cross-validate a plain linear regression and show the test score of each fold.
lr = LinearRegression()
result = cross_validate(lr, X, y)
result['test_score']

array([1., 1., 1., 1., 1.])

#### Automatic parameter searches

In [22]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

In [24]:
# Load the California housing features and targets as arrays, then make a
# seeded train/test split.
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [25]:
# Parameter space to sample from. scipy's randint excludes the upper bound,
# so n_estimators is drawn from 1-4 and max_depth from 5-9.
param_distributions = {
    'n_estimators': randint(1, 5),
    'max_depth': randint(5, 10),
}

# Randomized search over a seeded random-forest regressor, trying 5 sampled
# parameter settings.
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_distributions=param_distributions,
    n_iter=5,
    random_state=0,
)

In [26]:
# Run the randomized search on the training split only.
search.fit(X_train, y_train)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                           

In [28]:
# Parameter setting selected by the fitted search.
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [29]:
# Evaluate the fitted search on the held-out test split. No `scoring` was
# passed when the search was built, so the estimator's default scorer is used.
search.score(X_test, y_test)

0.7349601117850644