In [23]:
from sklearn import preprocessing
import numpy as np

# Sklearn Data Loading

After loading the data, one must extract it in the form of numpy arrays (for example via the `.data` and `.target` attributes of the returned object).

## Loading Small Datasets

In [10]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so it can
# no longer be imported on current releases. `load_diabetes` is another small
# regression dataset that ships with scikit-learn and needs no download.
from sklearn.datasets import load_diabetes
data_diabetes = load_diabetes()
type(data_diabetes)


sklearn.utils.Bunch

## Loading Larger Datasets with fetch

In [11]:
from sklearn.datasets import fetch_olivetti_faces

# Common keyword arguments of the fetch_* loaders (availability varies per
# loader, so check the individual docstring):
#   data_home: str     -> directory in which the data is downloaded/cached
#   shuffle: bool      -> whether to shuffle the samples
#   random_state: int  -> random seed used when shuffling
#   subset: str        -> which split to load; only some loaders offer it
#                         (e.g. fetch_20newsgroups accepts "train", "test" or
#                         "all", default "train"). fetch_olivetti_faces does
#                         not take this parameter.
data = fetch_olivetti_faces()

downloading Olivetti faces from https://ndownloader.figshare.com/files/5976027 to C:\Users\Destr\scikit_learn_data


# Sklearn Data Generation

Generates synthetic data according to user-defined rules.

**make_blobs()**

In [13]:
from sklearn.datasets import make_blobs

# make_blobs(
#     n_samples: int     -> total number of points to generate
#     centers: int       -> number of cluster centers
#     random_state: int  -> random seed
# )
# A fixed seed makes the generated blobs identical on every re-run.
X, y = make_blobs(n_samples=50, centers=3, random_state=0)

In [15]:
# Preview the first few generated points and labels instead of the full arrays.
X[:5], y[:5]

**make_classification()**

In [19]:
from sklearn.datasets import make_classification

# make_classification(
#     n_samples: int      -> number of samples
#     n_features: int     -> total number of features
#     n_informative: int  -> number of informative features
#     n_redundant: int    -> number of features that are linear combinations
#                            of the informative ones
#     n_repeated: int     -> number of duplicated features
#     n_classes: int      -> number of classes (labels)
#     random_state: int   -> random seed
# )
X, y = make_classification(
    n_samples=100,
    n_features=20,
    n_informative=2,
    n_classes=2,
    random_state=0,
)
X

array([[-0.03926799,  0.13191176, -0.21120598, ...,  1.97698901,
         1.02122474, -0.46931074],
       [ 0.77416061,  0.10490717, -0.33281176, ...,  1.2678044 ,
         0.62251914, -1.49026539],
       [-0.0148577 ,  0.67057045, -0.21416666, ..., -0.10486202,
        -0.10169727, -0.45130304],
       ...,
       [ 0.29673317, -0.49610233, -0.86404499, ..., -1.10453952,
         2.01406015,  0.69042902],
       [ 0.08617684,  0.9836362 ,  0.17124355, ...,  2.11564734,
         0.11273794,  1.20985013],
       [-1.58249448, -1.42279491, -0.56430103, ...,  1.26661394,
        -1.31771734,  1.61805427]])

**make_regression()**

In [20]:
from sklearn.datasets import make_regression

# make_regression(
#     n_samples: int      -> number of samples
#     n_features: int     -> number of input features
#     n_informative: int  -> number of features actually used to build the target
#     n_targets: int      -> number of output dimensions
#     random_state: int   -> random seed
# )
# random_state=None draws fresh data on every run; pass an int to make the
# generated values reproducible.
X, y = make_regression(
    n_samples=100,
    n_features=100,
    n_informative=10,
    n_targets=1,
    random_state=None,
)
X

array([[ 0.05229635, -0.47227523, -0.49184875, ...,  0.28330517,
         2.27893698,  0.77107374],
       [-1.71546602, -0.52899053, -0.13432096, ...,  0.79098844,
        -0.57497042, -1.24853478],
       [ 0.39794382, -1.01071817, -0.45174629, ...,  0.56465747,
         0.27185294, -1.09949241],
       ...,
       [-0.51127029,  1.11784208, -1.49637706, ..., -1.57236241,
        -0.00912167, -0.15775453],
       [ 0.65199663, -0.81834373,  0.58859989, ..., -0.12144408,
        -0.97062642,  0.89532737],
       [-1.47344375,  0.98734665,  1.17079991, ...,  0.58065033,
         0.61327341,  0.17125346]])

It is also possible and even common to import datasets from outside of sklearn. Pandas is a common tool to help with this.

# Sklearn Data Preprocessing

**SimpleImputer()** (sklearn)

In [25]:
from sklearn.impute import SimpleImputer

# Imputer configured to replace NaN entries using the "mean" strategy,
# operating on a copy of its input.
imp_mean = SimpleImputer(
    missing_values=np.nan,
    strategy="mean",
    copy=True,
)

"""
SimpleImputer(
              missing_values=missing_classification,
              strategy=filling_strategy:str,
              copy=not_inplace:boolean
)

strategy:{"mean", "median", "most_frequent", "constant"}
if "constant", you must also input a fill_value to indicate which numerical constant to use when filling

API
fit(X): look at X and perform value calculations
transform(X): input calculations to transform X
fit_transform(X): fit then transform X
"""

'\nSimpleImputer(\n              missing_values=missing_classification,\n              strategy=filling_strategy:str,\n              copy=not_inplace:boolean\n)\n\nstrategy:{"mean", "median", "most_frequent", "constant"}\nif "constant", you must also input a fill_value to indicate which numerical constant to use when filling\n\nAPI\nfit(X): look at X and perform value calculations\ntransform(X): input calculations to transform X\nfit_transform(X): fit then transform X\n'

**drop_duplicates()** (pandas)

In [None]:
import pandas as pd

# .drop_duplicates(
#     subset: list   -> columns used to decide whether two rows are duplicates
#     keep: str      -> which duplicate to keep: "first", "last", or False (drop all)
#     inplace: bool  -> modify the frame in place instead of returning a new one
# )
# A small self-contained frame is used here: the `data` variable created earlier
# is a scikit-learn Bunch, which has no drop_duplicates method.
frame_raw = pd.DataFrame(
    {
        "A": [1, 1, 2, 2],
        "B": ["x", "x", "y", "z"],
        "C": [10, 20, 30, 40],
    }
)
# Return a new frame rather than mutating in place so the cell is safe to re-run.
frame_dedup = frame_raw.drop_duplicates(subset=["A", "B"], keep="first")
frame_dedup

**CountVectorizer()**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer(
#     input: str       -> "content", "filename" or "file": how the items passed
#                         to fit/transform are interpreted
#     encoding: str    -> encoding used to decode bytes or files
#     lowercase: bool  -> convert all characters to lowercase before tokenizing
# )
count_vectorizer = CountVectorizer(input="content", encoding="utf-8", lowercase=True)
count_vectorizer

After fitting, the count vectorizer is able to vectorize the input strings by extracting the "words" delimited by surrounding whitespace and counting their occurrences. See this picture: ![image.png](attachment:image.png)

**StandardScaler**

Standardization rescales the data onto a common, "regular" scale so that features are comparable relative to one another. This makes the data much easier to work with and can speed up the convergence of gradient descent. *sklearn's `preprocessing.scale()` rescales data to have a mean of 0 and a variance of 1.*

In [None]:
# Keep the standardized result under its own name and preview a few rows
# rather than echoing the whole array.
X_scaled = preprocessing.scale(X)
X_scaled[:5]


**preprocessing.MinMaxScaler()**

Rescales the data linearly to fit within a user-defined upper and lower bound.

**preprocessing.LabelBinarizer()**

Used to binarize the labels used for supervised learning. The binarization is done with dummy boolean (0/1) variables, one per class. See this photo: ![image.png](attachment:image.png)

**preprocessing.LabelEncoder**

Similar to LabelBinarizer, but instead of representing each input label as a separate vector (or row), LabelEncoder encodes the entire input in a single vector by mapping each distinct label to an integer code, much like looking values up by their dict() keys. As such: ![image.png](attachment:image.png)

# Sklearn Models

**model_selection.train_test_split()**

Divides the data into training and testing sets. To also obtain a validation set, apply it a second time to the training portion.

In [None]:
from sklearn.model_selection import train_test_split

# train_test_split(
#     *arrays            -> the arrays to split (e.g. X, y), all of equal length
#     test_size          -> int (number of test samples) or float (ratio of the data)
#     shuffle: bool      -> whether to shuffle before splitting
#     random_state: int  -> random seed
# )
# Small self-contained arrays so the cell runs on its own.
features = np.arange(20).reshape(10, 2)
targets = np.arange(10)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)
X_train.shape, X_test.shape

## Model Parameters

Using logistic regression as an example, here is the gist of model parameters within sklearn

In [None]:
from sklearn import linear_model

# penalty: str        -> regularization norm applied to the coefficients
# tol: float          -> tolerance of the stopping criterion
# fit_intercept: bool -> whether or not to fit an intercept
# random_state: int   -> random seed
# solver: str         -> optimization algorithm
# max_iter: int       -> maximum number of solver iterations
logistic_regression = linear_model.LogisticRegression(
    penalty="l2",
    tol=0.0001,
    fit_intercept=True,
    random_state=None,
    solver="lbfgs",
    max_iter=100,
)


![image.png](attachment:image.png)

**API for sklearn models**

- `fit(X, y)` -> trains the model to learn f: dim(X) -> dim(y)
- `predict(X)` -> predicts y for X, i.e. f(X)
- `predict_proba(X)` -> used strictly for classification tasks. Returns the estimated probability that each sample in X belongs to each label
- `score(X, y)` -> scores the model. linear: r^2, kmeans: negative clustering error (inertia), logistic: avg acc