In [None]:
# Let's play around with sklearn

# reference https://scikit-learn.org/stable/getting_started.html#getting-started

# Scikit-learn is an open source machine learning library that supports supervised and unsupervised learning. 
# It also provides various tools for model fitting, data preprocessing, model selection and evaluation, 
# and many other utilities.

# A few keywords to know before we start

# What is Machine Learning?

# Machine learning is a subset of artificial intelligence (AI) in which algorithms learn by example 
# from historical data to predict outcomes and uncover patterns not easily spotted by humans. 

# What is Model Fitting?
# Model fitting is a measure of how well a machine learning model generalizes 
# to data similar to that on which it was trained. 
# A model that is well-fitted produces more accurate outcomes.
# A model that is overfitted matches the data too closely. 
# A model that is underfitted doesn’t match closely enough.

# What does Prediction mean in Machine Learning?
# “Prediction” refers to the output of an algorithm after it has been trained on a historical dataset 
# and applied to new data when forecasting the likelihood of a particular outcome.
# The algorithm will generate probable values for an unknown variable for each record in the new data, 
# allowing the model builder to identify what that value will most likely be.

# What is Cross-validation?
# Cross-validation is an extension of the training, validation, 
# and holdout (TVH) process that minimizes the sampling bias of machine learning models. 

In [1]:
# Fitting and predicting: estimator basics

# Scikit-learn provides dozens of built-in machine learning algorithms and models, called estimators. 
# Each estimator can be fitted to some data using its fit method.

# we fit a RandomForestClassifier to some very basic data:

# What is a random forest classifier?
# Random forests or random decision forests are an ensemble learning method for classification, 
# regression and other tasks that operate by constructing a multitude of decision trees at training time 
# and outputting the class that is the mode of the classes (classification) 
# or mean/average prediction (regression) of the individual trees.

# What is an ensemble learning method?
# In statistics and machine learning, ensemble methods use multiple learning algorithms to obtain better predictive performance than could be obtained from any of the constituent learning algorithms alone.
# Unlike a statistical ensemble in statistical mechanics, which is usually infinite, a machine learning ensemble consists of only a concrete finite set of alternative models, 
# but typically allows for much more flexible structure to exist among those alternatives.

from sklearn.ensemble import RandomForestClassifier

# Create the (not yet fitted) estimator; random_state=0 is passed so repeated runs use the same seed.
clf = RandomForestClassifier(random_state=0)

In [2]:
# Toy samples matrix: 3 samples (rows) x 4 features (columns).
X = [
    [1, 2, 3, 4],
    [11, 12, 13, 14],
    [21, 22, 23, 24],
]


In [3]:
y = [0, 1, 2]  # class label for each sample: y[i] is the target for row i of X


In [4]:
# The fit method generally accepts 2 inputs:
#
# - The samples matrix (or design matrix) X.
#   The size of X is typically (n_samples, n_features),
#   which means that samples are represented as rows and features are represented as columns.
#
# - The target values y, which are real numbers for regression tasks, or integers for
#   classification (or any other discrete set of values).
#   For unsupervised learning tasks, y does not need to be specified.
#   y is usually a 1d array where the i-th entry corresponds to the target of
#   the i-th sample (row) of X.
clf.fit(X, y)

RandomForestClassifier(random_state=0)

In [5]:
# Predict on the training samples themselves; on this toy data the forest
# returns the labels it was fitted with (see the output below).
clf.predict(X)

array([0, 1, 2])

In [8]:
# Predict classes for three samples that were not part of the data passed to fit.
clf.predict(
    [
        [4, 5, 6, 9],
        [14, 15, 16, 19],
        [24, 25, 26, 29],
    ]
)

array([0, 1, 2])

In [9]:
# Transformers and pre-processors 

# Machine learning workflows are often composed of different parts. 
# A typical pipeline consists of a pre-processing step that transforms or imputes the data, 
# and a final predictor that predicts target values.

# Pre-processors and transformers follow the same API as the estimator objects 
# (they actually all inherit from the same BaseEstimator class). 
# The transformer objects don’t have a predict method but rather a transform method that
# outputs a newly transformed sample matrix X:

from sklearn.preprocessing import StandardScaler
# Small samples matrix for the scaling demo: 3 samples (rows) x 3 features (columns).
X = [
    [0, 2, 3],
    [-1, 0, 10],
    [9, -10, -10],
]

In [16]:
# Create the scaler, then fit it so the per-feature mean and std are computed from X
# and stored on the object for later scaling.
fx = StandardScaler()
fx.fit(X)

# Apply the learned scaling to the same data it was fitted on.
fx.transform(X)

array([[-0.59299945,  0.88900089,  0.2413554 ],
       [-0.81537425,  0.50800051,  1.08609928],
       [ 1.4083737 , -1.3970014 , -1.32745468]])

In [19]:
# Per-column means computed from X during fit (one value per feature).
fx.mean_

array([ 2.66666667, -2.66666667,  1.        ])

In [15]:
# Reuse the already-fitted scaler on new samples; the statistics come from the earlier fit.
fx.transform(
    [
        [4, -5, 6],
        [-14, 15, 16],
        [24, 25, -26],
    ]
)

array([[ 0.29649973, -0.44450044,  0.60338849],
       [-3.70624658,  3.36550337,  1.81016547],
       [ 4.74399563,  5.27050527, -3.25829785]])

In [20]:
# Pipelines: chaining pre-processors and estimators

# Transformers and estimators (predictors) can be combined together into a single unifying object: a Pipeline. 
# The pipeline offers the same API as a regular estimator: it can be fitted and used for prediction with fit and predict. 


from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



In [21]:
# Build a pipeline that standardizes the features and then applies logistic regression.
pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))



In [22]:
# Load the iris features and labels, then hold part of the data out as a test set.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)



In [23]:
# Fit the whole pipeline on the training split only; the test split is kept for evaluation.
pipe.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(random_state=0))])

In [24]:

# We can now use the pipeline like any other estimator.
# accuracy_score takes the ground-truth labels first and the predictions second;
# accuracy itself is symmetric, but keeping the documented order avoids mistakes
# when swapping in metrics that are not (e.g. precision or recall).
accuracy_score(y_test, pipe.predict(X_test))

0.9736842105263158

In [25]:
# Model evaluation

# Fitting a model to some data does not entail that it will predict well on unseen data. 
# This needs to be directly evaluated. 
# We have just seen the train_test_split helper that splits a dataset into train and test sets, 
# but scikit-learn provides many other tools for model evaluation, in particular for cross-validation.

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [29]:
# Generate a synthetic regression dataset with 1000 samples (random_state fixed for repeatability).
# NOTE(review): an assignment displays nothing, so the array shown under this cell is
# presumably stale output from an earlier run that ended with `X` — re-run to confirm.
X, y = make_regression(n_samples=1000, random_state=0)

array([[ 0.41929687, -1.5489299 ,  0.65218686, ..., -0.81368398,
        -2.03884275,  0.90000294],
       [-2.06947249,  0.72712806,  0.0975975 , ..., -0.35978104,
        -0.74513907, -0.55050613],
       [-0.37595997,  0.66414405,  1.02239232, ...,  0.50481546,
        -2.83201187, -0.79978614],
       ...,
       [-0.7719197 , -1.33667649, -0.72733814, ..., -0.59830311,
        -0.60986158,  1.69242973],
       [ 0.67198393, -1.50733364,  1.17622157, ...,  2.05921537,
        -1.11140442,  0.01787532],
       [ 1.10334268, -0.59531919, -0.29831814, ..., -0.89706521,
        -0.11546748, -1.299286  ]])

In [30]:
# Unfitted linear regression estimator, to be evaluated with cross-validation.
lr = LinearRegression()

In [32]:
# Cross-validate the linear model on the synthetic data; the returned dict holds
# per-fold fit times, score times and test scores.
# NOTE(review): a test_score of exactly 1.0 on every fold suggests the generated data
# is noise-free — confirm against make_regression's `noise` setting.
result = cross_validate(estimator=lr, X=X, y=y)
result

{'fit_time': array([0.01299667, 0.008919  , 0.00655627, 0.00681496, 0.00844073]),
 'score_time': array([0.00229025, 0.0009501 , 0.00089979, 0.00081778, 0.00110316]),
 'test_score': array([1., 1., 1., 1., 1.])}

In [33]:
# Automatic parameter searches 

# All estimators have parameters (often called hyper-parameters in the literature) that can be tuned. 
# The generalization power of an estimator often critically depends on a few parameters. 
# Quite often, it is not clear what the exact values of these parameters should be since they depend on the data at hand.

# Scikit-learn provides tools to automatically find the best parameter combinations (via cross-validation).

# In the following example, we randomly search over the parameter space of a random forest with a RandomizedSearchCV object. 
# When the search is over, the RandomizedSearchCV behaves as a RandomForestRegressor that has been fitted with the best set of parameters. 
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint




In [34]:
# Fetch the California housing features and targets, then hold part of the data out for testing.
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [35]:
# Parameter space the search samples from: one frozen scipy integer
# distribution per random-forest hyper-parameter.
param_distributions = dict(
    n_estimators=randint(1, 5),
    max_depth=randint(5, 10),
)

In [36]:
# Create the randomized-search object around a seeded random forest regressor;
# it is fitted to the data in a later step.
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_distributions=param_distributions,
    n_iter=5,
    random_state=0,
)

In [37]:
# Run the randomized search on the training split (n_iter=5 parameter settings are tried).
search.fit(X_train, y_train)

RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8a29435af0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8a29678d90>},
                   random_state=0)

In [38]:
# Best parameter combination found by the search (keys match param_distributions).
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [39]:
# The fitted search object now acts like a normal random forest estimator
# (here with max_depth=9 and n_estimators=4, per best_params_).
# Score it on the held-out test split.
search.score(X_test, y_test)

0.735363411343253