In [1]:
import warnings
# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): no category or module is given, so this also hides library
# deprecation warnings; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

#### <b><span style="color:#f283b3;">01. Supervised Learning
###### <b>Supervised learning</b> is a type of machine learning where a model is trained <b>using labeled data.</b> In this approach, each training example includes an input paired with the correct output, allowing the model to learn the relationship between inputs and their corresponding outputs. By learning from these labeled examples, the model can then make predictions on new, unseen data. The main goal in supervised learning is <b>to find a mapping from inputs to outputs that minimizes prediction errors.</b> It’s widely used for tasks such as classification (e.g., spam detection, image recognition) and regression (e.g., predicting house prices, forecasting sales).

>Linear models

###### <b><span style="color:green;">Ordinary Least Squares

In [2]:
# Basic formula: y = B0 + B1*x1 + B2*x2, estimated by ordinary least squares.
from sklearn import linear_model  # type: ignore

# LinearRegression.fit returns the fitted estimator itself, so building and
# training the model can be written as one chained expression.
reg = linear_model.LinearRegression().fit(
    [[0, 0], [1, 1], [2, 2]],  # training inputs: three samples, two features each
    [0, 1, 2],                 # training targets: one value per sample
)

# Report the fitted parameters.
print("Intercept (B0):", reg.intercept_)  # constant term
print("Coefficients of B1 and B2:", reg.coef_)  # one coefficient per feature

Intercept (B0): 2.220446049250313e-16
Coefficients of B1 and B2: [0.5 0.5]


In [3]:
# Example
# Code source: Jaques Grobler
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes regression dataset as a Bunch, then unpack the feature
# matrix and the target vector (the same pair that return_X_y=True yields).
diabetes_bunch = datasets.load_diabetes()
diabetes_X, diabetes_y = diabetes_bunch.data, diabetes_bunch.target


>##### <b><span style="color:#f283b3;">02. Unsupervised Learning

>##### <b><span style="color:#f283b3;">03. Model selection and evaluation

>##### <b><span style="color:#f283b3;">04. Inspection

>##### <b><span style="color:#f283b3;">05. Visualizations

>##### <b><span style="color:#f283b3;">06. Dataset transformations

##### <b><span style="color:#f283b3;">07. Dataset loading utilities<br>
###### There are four types of dataset loading methods: toy datasets, real-world datasets, generated datasets, and loading other datasets. <b>The dataset loaders</b> can be used to load small, standard datasets, as described in the Toy Datasets section. <b>The dataset fetchers</b> are used to download and load larger datasets, which are covered in the Real World Datasets section. <b>The dataset generation functions</b> allow you to create controlled synthetic datasets, discussed in the Generated Datasets section.

>Toy datasets

In [None]:
from sklearn.datasets import load_iris 
from sklearn.datasets import load_diabetes 
from sklearn.datasets import load_digits
from sklearn.datasets import load_linnerud #Load and return the physical exercise Linnerud dataset.
from sklearn.datasets import load_wine
from sklearn.datasets import load_breast_cancer

In [16]:
# Show each toy-dataset loader's signature, which reveals its default arguments.
# display() accepts several objects and renders them one after another.
display(
    load_iris,
    load_diabetes,
    load_digits,
    load_linnerud,
    load_wine,
    load_breast_cancer,
)

<function sklearn.datasets._base.load_iris(*, return_X_y=False, as_frame=False)>

<function sklearn.datasets._base.load_diabetes(*, return_X_y=False, as_frame=False, scaled=True)>

<function sklearn.datasets._base.load_digits(*, n_class=10, return_X_y=False, as_frame=False)>

<function sklearn.datasets._base.load_linnerud(*, return_X_y=False, as_frame=False)>

<function sklearn.datasets._base.load_wine(*, return_X_y=False, as_frame=False)>

<function sklearn.datasets._base.load_breast_cancer(*, return_X_y=False, as_frame=False)>

###### <b><span style="color:green;">Iris [Classification]

In [17]:
# Alternative calls, kept for reference (only the last, uncommented call runs):
# load_iris(as_frame=True)     # Bunch whose data/target are pandas objects (DataFrame / Series); handy for EDA.
# load_iris(as_frame=False)    # Bunch (dictionary-like) whose data/target are NumPy arrays; this is the default.
# load_iris(return_X_y=True)   # Only the (X, y) pair, without the metadata stored in the Bunch.
# load_iris(return_X_y=False)  # The full Bunch, including metadata such as feature and target names; this is the default.
# load_iris()                  # Same as using both defaults above.
# Combined here: the (X, y) pair, returned as pandas objects.
load_iris(return_X_y=True, as_frame=True)

(     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 0                  5.1               3.5                1.4               0.2
 1                  4.9               3.0                1.4               0.2
 2                  4.7               3.2                1.3               0.2
 3                  4.6               3.1                1.5               0.2
 4                  5.0               3.6                1.4               0.2
 ..                 ...               ...                ...               ...
 145                6.7               3.0                5.2               2.3
 146                6.3               2.5                5.0               1.9
 147                6.5               3.0                5.2               2.0
 148                6.2               3.4                5.4               2.3
 149                5.9               3.0                5.1               1.8
 
 [150 rows x 4 columns],
 0      0
 1      0
 2   

###### <b><span style="color:green;">Diabetes [Regression]

In [20]:
# Return the (X, y) pair as NumPy arrays; scaled=False asks for the raw
# feature values rather than the default pre-scaled ones.
load_diabetes(return_X_y=True, scaled=False, as_frame=False)

(array([[59.    ,  2.    , 32.1   , ...,  4.    ,  4.8598, 87.    ],
        [48.    ,  1.    , 21.6   , ...,  3.    ,  3.8918, 69.    ],
        [72.    ,  2.    , 30.5   , ...,  4.    ,  4.6728, 85.    ],
        ...,
        [60.    ,  2.    , 24.9   , ...,  3.77  ,  4.1271, 95.    ],
        [36.    ,  1.    , 30.    , ...,  4.79  ,  5.1299, 85.    ],
        [36.    ,  1.    , 19.6   , ...,  3.    ,  4.5951, 92.    ]]),
 array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
        128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
        150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
        200., 252., 113., 143.,  51.,  52., 210.,  65

###### <b><span style="color:green;">Digits [Classification]

In [26]:
# n_class sets how many digit classes (labels) to load, counting up from 0.
# load_digits(n_class=2)  # would keep only two classes
# Load five classes as an (X, y) pair of arrays.
load_digits(n_class=5, return_X_y=True)

(array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  0., ...,  9.,  0.,  0.],
        [ 0.,  0.,  0., ...,  4.,  0.,  0.],
        [ 0.,  0.,  6., ...,  6.,  0.,  0.]]),
 array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 0, 4, 1, 3, 1, 0,
        0, 2, 2, 2, 0, 1, 2, 3, 3, 3, 3, 4, 4, 1, 0, 2, 2, 0, 0, 1, 3, 2,
        1, 4, 3, 1, 3, 1, 4, 3, 1, 4, 0, 3, 1, 4, 4, 2, 2, 2, 4, 4, 0, 0,
        1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 0, 4, 1, 3, 1, 0, 0,
        2, 2, 2, 0, 1, 2, 3, 3, 3, 3, 4, 4, 1, 0, 2, 2, 0, 0, 1, 3, 2, 1,
        3, 1, 3, 1, 4, 3, 1, 4, 0, 3, 1, 4, 4, 2, 2, 2, 4, 4, 0, 0, 1, 2,
        3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 0, 4, 1, 3, 1, 0, 0, 2, 2,
        2, 0, 1, 2, 3, 3, 3, 3, 4, 4, 1, 0, 2, 2, 0, 0, 1, 3, 2, 1, 4, 3,
        1, 3, 1, 4, 3, 1, 4, 0, 3, 1, 4, 4, 2, 2, 2, 4, 4, 0, 3, 0, 1, 2,
        3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 0,

###### <b><span style="color:green;">Linnerud

In [48]:
import pandas as pd

# Load Linnerud once, as pandas objects: X holds the exercise columns and
# y holds the remaining columns. (The cell used to call load_linnerud twice
# and throw the first result away.)
X, y = load_linnerud(return_X_y=True, as_frame=True)

# Put features and targets side by side (axis=1 concatenates column-wise)
# so the whole dataset can be inspected as a single table.
a = pd.concat([X, y], axis=1)
a

Unnamed: 0,Chins,Situps,Jumps,Weight,Waist,Pulse
0,5.0,162.0,60.0,191.0,36.0,50.0
1,2.0,110.0,60.0,189.0,37.0,52.0
2,12.0,101.0,101.0,193.0,38.0,58.0
3,12.0,105.0,37.0,162.0,35.0,62.0
4,13.0,155.0,58.0,189.0,35.0,46.0
5,4.0,101.0,42.0,182.0,36.0,56.0
6,8.0,101.0,38.0,211.0,38.0,56.0
7,6.0,125.0,40.0,167.0,34.0,60.0
8,15.0,200.0,40.0,176.0,31.0,74.0
9,17.0,251.0,250.0,154.0,33.0,56.0


###### <b><span style="color:green;">Wine [Classification]

In [61]:
# Load the wine data as pandas objects: X is the feature table, y the class labels.
X, y = load_wine(as_frame=True, return_X_y=True)

# Concatenate column-wise (axis="columns" is axis 1; axis 0 / "index" would stack rows)
# so the target appears as the last column next to the features.
a = pd.concat(objs=[X, y], axis="columns")
a



Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


###### <b><span style="color:green;">Breast Cancer [Classification]

In [68]:
# Load the breast-cancer data as pandas objects: X is the feature table, y the labels.
X, y = load_breast_cancer(as_frame=True, return_X_y=True)

# Append the (named) target Series as an extra column; both share the same
# row index, so this lines up row for row like a column-wise concat.
a = X.join(y)
a


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


> Real world datasets

In [82]:
from sklearn.datasets import fetch_olivetti_faces

In [88]:
from sklearn.datasets import fetch_olivetti_faces

# Download (on first use) and load the Olivetti faces dataset.
# NOTE: if this raises "Error -3 while decompressing data", the locally stored
# copy is most likely corrupted or incomplete. Remove it from the scikit-learn
# data directory (see sklearn.datasets.get_data_home / clear_data_home) and
# re-run this cell to download it again.
olivetti_faces = fetch_olivetti_faces()

# Display the actual shapes. The cell previously contained pasted expected
# values as bare tuple literals, so the real shapes were never shown.
# Expected: (400, 4096) for data, (400,) for target, (400, 64, 64) for images.
display(
    olivetti_faces.data.shape,
    olivetti_faces.target.shape,
    olivetti_faces.images.shape,
)

error: Error -3 while decompressing data: too many length or distance symbols

>Generated datasets

>Loading other datasets

##### <b><span style="color:#f283b3;">08. Computing with scikit-learn

>##### <b><span style="color:#f283b3;">09. Model persistence

>##### <b><span style="color:#f283b3;">10. Common pitfalls and recommended practices

>##### <b><span style="color:#f283b3;">11. Dispatching

>##### <b><span style="color:#f283b3;">12. Choosing the right estimator

>##### <b><span style="color:#f283b3;">13. External resources, videos and talks