In [None]:
from sklearn.ensemble import RandomForestClassifier
# Training data: two samples with three features each, and one label per sample.
X = [
    [1, 2, 3],
    [11, 12, 13],
]
y = [0, 1]

# Build the forest with a fixed seed so repeated runs give the same model,
# then fit it on the training data.
clf = RandomForestClassifier(random_state=0)
clf.fit(X, y)

# Two samples the model has not seen during training.
Z = [
    [4, 5, 6],
    [14, 15, 16],
]

# The last expression of the cell is displayed: the predicted labels for Z.
clf.predict(Z)

In [None]:
from sklearn.preprocessing import Binarizer
# Four samples by three features of continuous values.
X = [
    [0.5, 5.0, 0.2],
    [0.1, 1.0, 0.3],
    [0.7, 0.2, 0.4],
    [0.4, 3.0, 0.3],
]

# Threshold the matrix at 0.3 and display the binarized result.
binarizer = Binarizer(threshold=0.3)
binarizer.fit_transform(X)

In [None]:
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Generate dummy regression data (seeded so the run is reproducible).
X, y = make_regression(n_samples=1000, random_state=0)

# Hold out a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Learn the scaling statistics from the training set only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Create and train the model on the scaled training data.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

# Apply the *same* fitted scaler to the test set. Re-fitting a new scaler on
# the test data would preprocess train and test differently and leak test-set
# statistics into the evaluation.
X_test_scaled = scaler.transform(X_test)
pred = lr.predict(X_test_scaled)

# Evaluate the model; arguments follow the documented (y_true, y_pred) order.
mean_squared_error(y_test, pred)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset and hold out a test split (seeded for reproducibility).
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Chain scaling and classification into a single estimator object.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# Fitting the pipeline fits every step in order on the training data.
pipe.fit(X_train, y_train)

# The pipeline is used like any other estimator: predict, then score.
y_pred = pipe.predict(X_test)
accuracy_score(y_pred, y_test)

In [None]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

# Model to evaluate and a seeded synthetic regression dataset.
lr = LinearRegression()
X, y = make_regression(n_samples=1000, random_state=0)

# No `cv` argument is given, so the default 5-fold cross-validation is used.
result = cross_validate(lr, X, y)

# Display the per-fold test scores; the r_squared score is high because the
# dataset is easy.
result['test_score']

## Making a custom estimator and transformer

A very important way to interact with scikit-learn is by creating classes that implement compatible APIs.

Let's look at a trivial example: a simple estimator that learns the mean of a 1-dimensional dataset and "predicts" by adding the mean to, or subtracting it from, each input value. The `operator` parameter selects which of the two operations is applied.

In [None]:
from sklearn.base import BaseEstimator
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted


class OneDimensionalMeanRegressor(BaseEstimator):
    """Toy estimator that learns the mean of its training inputs and shifts new inputs by it.

    Parameters
    ----------
    operator : str, default='sum'
        ``'sum'`` makes ``predict`` add the learned mean to its input. Any
        other value (for example ``'sub'``) makes ``predict`` subtract it.

    Attributes
    ----------
    n_features_in_ : int
        Number of columns of ``X`` seen during ``fit``.
    X_ : ndarray
        The validated training inputs passed to ``fit``.
    mean_ : scalar
        Mean taken over all entries of the training inputs.
    """

    def __init__(self, operator='sum'):
        # Only store the parameter; no validation or computation happens here.
        self.operator = operator

    def fit(self, X, y):
        """Validate ``X`` and ``y`` and learn the mean of ``X``.

        ``y`` is validated together with ``X`` but is not otherwise used.

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.
        """
        X, y = check_X_y(X, y)

        self.n_features_in_ = X.shape[1]
        # Kept as a fitted attribute for backward compatibility; ``predict``
        # only needs ``mean_`` and ``n_features_in_``.
        self.X_ = X
        self.mean_ = X.mean()

        return self

    def predict(self, X):
        """Return ``X`` shifted by the learned mean.

        Returns ``X + mean_`` when ``operator == 'sum'`` and ``X - mean_``
        otherwise; the result has the same shape as the validated ``X``.

        Raises
        ------
        ValueError
            If ``X`` does not have the number of features seen during ``fit``.
        """
        check_is_fitted(self)

        X = check_array(X)

        # Reject inputs whose feature count differs from the training data
        # instead of silently broadcasting the scalar mean over them.
        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X has {X.shape[1]} features, but "
                f"{self.__class__.__name__} is expecting "
                f"{self.n_features_in_} features as input."
            )

        if self.operator == 'sum':
            return X + self.mean_
        return X - self.mean_

In [None]:
import numpy as np
# Seed the generator (0, matching the random_state used elsewhere in this
# notebook) so the example data is identical on every run.
rng = np.random.default_rng(0)

# 100 samples with a single feature, shaped (100, 1); the targets are all zero
# because the estimator does not use them beyond validation.
X = rng.random(size=100)[:, np.newaxis]
y = np.zeros(100)

# Hold out a test split (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Build the estimator in subtraction mode: any `operator` value other than
# 'sum' makes `predict` subtract the learned mean.
odmr = OneDimensionalMeanRegressor(operator='sub')

In [None]:
# Fit on the training split only, then predict on the held-out split. Fitting
# on the full X would include the test rows in the learned mean.
odmr.fit(X_train, y_train).predict(X_test)

In [None]:
# Display the estimator's parameters; `get_params` is not defined on the class
# itself, so it comes from the BaseEstimator parent.
odmr.get_params()

In [None]:
# Run scikit-learn's estimator checks against the custom estimator.
# NOTE(review): presumably this raises if any check fails — confirm against the
# installed scikit-learn version.
check_estimator(odmr)