In [1]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Show the built-in documentation for LabelEncoder.
help(LabelEncoder)

Help on class LabelEncoder in module sklearn.preprocessing.label:

class LabelEncoder(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  Encode labels with value between 0 and n_classes-1.
 |  
 |  Read more in the :ref:`User Guide <preprocessing_targets>`.
 |  
 |  Attributes
 |  ----------
 |  classes_ : array of shape (n_class,)
 |      Holds the label for each class.
 |  
 |  Examples
 |  --------
 |  `LabelEncoder` can be used to normalize labels.
 |  
 |  >>> from sklearn import preprocessing
 |  >>> le = preprocessing.LabelEncoder()
 |  >>> le.fit([1, 2, 2, 6])
 |  LabelEncoder()
 |  >>> le.classes_
 |  array([1, 2, 6])
 |  >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
 |  array([0, 0, 1, 2]...)
 |  >>> le.inverse_transform([0, 0, 1, 2])
 |  array([1, 1, 2, 6])
 |  
 |  It can also be used to transform non-numerical labels (as long as they are
 |  hashable and comparable) to numerical labels.
 |  
 |  >>> le = preprocessing.LabelEncoder()
 |  >>> le.fit(["paris"

In [8]:
import numpy as np
# A flat list of four labels becomes a 1-D array; its shape is a 1-tuple.
np.array([1, 2, 2, 6]).shape

(4,)

In [11]:
from sklearn.preprocessing import LabelEncoder

# Sample integer labels; the value 2 appears twice.
X = [1, 2, 2, 6]

# fit() hands back the fitted encoder, so it can be bound in the same
# statement and then displayed as the cell's result.
label_encoder = LabelEncoder().fit(X)
label_encoder

LabelEncoder()

In [12]:
# Distinct labels the encoder learned during fit.
label_encoder.classes_

array([1, 2, 6])

In [13]:
# Encode each label as its integer code (its position in classes_).
label_encoder.transform([1,1,2,6])

array([0, 0, 1, 2], dtype=int64)

In [14]:
# Map integer codes back to the original labels.
label_encoder.inverse_transform([0, 0, 1, 2])

  if diff:


array([1, 1, 2, 6])

In [15]:
# Fit on X and encode it in a single call.
label_encoder.fit_transform(X)

array([0, 1, 1, 2], dtype=int64)

In [17]:
# Classes held by the encoder after the fit_transform call on X.
label_encoder.classes_

array([1, 2, 6])

In [18]:
# Round trip: encoding X and decoding the result gives back the labels in X.
label_encoder.inverse_transform(label_encoder.fit_transform(X))

  if diff:


array([1, 2, 2, 6])

In [21]:
# String labels work as well; refitting replaces the previously learned classes.
X = ["paris", "paris", "tokyo", "amsterdam"]
integer_encoded = label_encoder.fit_transform(X)

# Show the learned classes first, then the integer code assigned to each label.
print(label_encoder.classes_, integer_encoded, sep="\n")

['amsterdam' 'paris' 'tokyo']
[1 1 2 0]


In [22]:
# Decode the integer codes back into the original city names.
label_encoder.inverse_transform(integer_encoded)

  if diff:


array(['paris', 'paris', 'tokyo', 'amsterdam'], dtype='<U9')

In [23]:
from sklearn.preprocessing import OneHotEncoder

In [24]:
# Show the built-in documentation for OneHotEncoder.
help(OneHotEncoder)

Help on class OneHotEncoder in module sklearn.preprocessing.data:

class OneHotEncoder(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  Encode categorical integer features using a one-hot aka one-of-K scheme.
 |  
 |  The input to this transformer should be a matrix of integers, denoting
 |  the values taken on by categorical (discrete) features. The output will be
 |  a sparse matrix where each column corresponds to one possible value of one
 |  feature. It is assumed that input features take on values in the range
 |  [0, n_values).
 |  
 |  This encoding is needed for feeding categorical data to many scikit-learn
 |  estimators, notably linear models and SVMs with the standard kernels.
 |  
 |  Note: a one-hot encoding of y labels should use a LabelBinarizer
 |  instead.
 |  
 |  Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
 |  
 |  Parameters
 |  ----------
 |  n_values : 'auto', int or array of ints
 |      Number of values per feature.
 |

In [26]:
from sklearn.preprocessing import OneHotEncoder

In [47]:
# Create the encoder with default settings; it is fitted in a separate cell.
one_hot_encoder = OneHotEncoder()

# Four samples, each with three integer-coded categorical features.
integer_encoded = [
    [0, 0, 3],
    [1, 1, 0],
    [0, 2, 1],
    [1, 0, 2],
]


In [43]:
# fit() returns the fitted encoder, so the number of distinct values found
# per input feature can be read straight off the call.
one_hot_encoder.fit(integer_encoded).n_values_

array([2, 3, 4])

In [44]:
# Column offsets in the one-hot output: input feature i occupies output
# columns feature_indices_[i] up to (not including) feature_indices_[i + 1].
one_hot_encoder.feature_indices_

array([0, 2, 5, 9], dtype=int32)

In [45]:
# Encode the samples and convert the sparse result to a dense array for display.
one_hot_encoder.transform(integer_encoded).toarray()

array([[1., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 1., 1., 0., 0., 0., 0., 1., 0.]])

In [54]:
# Fit and encode in one call; with the default settings the result is a
# SciPy sparse matrix, which the type check below confirms.
onehot_encoded = one_hot_encoder.fit_transform(integer_encoded)
print(type(onehot_encoded))

<class 'scipy.sparse.csr.csr_matrix'>


In [56]:
# sparse=False makes the encoder return a plain NumPy array instead of a
# SciPy sparse matrix.
one_hot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = one_hot_encoder.fit_transform(integer_encoded)

# Show the result type, then the encoded rows themselves.
print(type(onehot_encoded), onehot_encoded, sep="\n")

<class 'numpy.ndarray'>
[[1. 0. 1. 0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 1. 0. 1. 0. 0.]
 [0. 1. 1. 0. 0. 0. 0. 1. 0.]]


In [100]:
from sklearn.preprocessing import OneHotEncoder
# Ten samples with three columns; column 0 holds integer category codes
# (0, 1 or 2) and the other two columns hold numeric values. Mixing ints and
# floats makes NumPy store the whole array as floats.
integer_encoded = np.array([
    [0, 44.0, 72000.0],
    [2, 27.0, 48000.0],
    [1, 30.0, 54000.0],
    [2, 38.0, 61000.0],
    [1, 40.0, 63777.77777777778],
    [0, 35.0, 58000.0],
    [2, 38.77777777777778, 52000.0],
    [0, 48.0, 79000.0],
    [1, 50.0, 83000.0],
    [0, 37.0, 67000.0],
])
type(integer_encoded)

numpy.ndarray

In [101]:
# Display the array built in the previous cell.
integer_encoded

array([[0.00000000e+00, 4.40000000e+01, 7.20000000e+04],
       [2.00000000e+00, 2.70000000e+01, 4.80000000e+04],
       [1.00000000e+00, 3.00000000e+01, 5.40000000e+04],
       [2.00000000e+00, 3.80000000e+01, 6.10000000e+04],
       [1.00000000e+00, 4.00000000e+01, 6.37777778e+04],
       [0.00000000e+00, 3.50000000e+01, 5.80000000e+04],
       [2.00000000e+00, 3.87777778e+01, 5.20000000e+04],
       [0.00000000e+00, 4.80000000e+01, 7.90000000e+04],
       [1.00000000e+00, 5.00000000e+01, 8.30000000e+04],
       [0.00000000e+00, 3.70000000e+01, 6.70000000e+04]])

In [102]:
# One-hot encode only column 0; the output below shows the remaining columns
# carried through unchanged after the one-hot columns.
onehotEencoder = OneHotEncoder(categorical_features=[0])

# Print the encoder created in this cell. The previous `onehot_encoder` name
# was never defined in this notebook and raised NameError on a fresh kernel.
print(onehotEencoder)

# NOTE: this rebinds `integer_encoded` to the encoded (sparse) result, so the
# cell is not idempotent. Re-run the cell that builds the array before
# re-running this one.
integer_encoded = onehotEencoder.fit_transform(integer_encoded)

# Number of distinct values found in each encoded column.
onehotEencoder.n_values_

OneHotEncoder(categorical_features=[0], dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)


array([3])

In [103]:
# `integer_encoded` now refers to the sparse matrix returned by fit_transform.
integer_encoded

<10x5 sparse matrix of type '<class 'numpy.float64'>'
	with 30 stored elements in COOrdinate format>

In [104]:
# Convert the sparse result to a dense array so every value is visible.
integer_encoded.toarray()

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

In [105]:
import pandas as pd

In [106]:
# Show the help text for pandas' read_csv.
help(pd.read_csv)

Help on function read_csv in module pandas.io.parsers:

read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True, skipfooter=0, skip_footer=0, doublequote=True, delim_whitespace=False, as_recarray=None, compact_ints=None, use_unsigned=None, low_memory=True, buffer_lines=None, memory_map=False, float_precision=None)


In [107]:
# Show the help text for pandas' read_table.
help(pd.read_table)

Help on function read_table in module pandas.io.parsers:

read_table(filepath_or_buffer, sep='\t', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True, skipfooter=0, skip_footer=0, doublequote=True, delim_whitespace=False, as_recarray=None, compact_ints=None, use_unsigned=None, low_memory=True, buffer_lines=None, memory_map=False, float_precision=N

In [110]:
from sklearn.model_selection import train_test_split


In [111]:
# Show the built-in documentation for train_test_split.
help(train_test_split)

Help on function train_test_split in module sklearn.model_selection._split:

train_test_split(*arrays, **options)
    Split arrays or matrices into random train and test subsets
    
    Quick utility that wraps input validation and
    ``next(ShuffleSplit().split(X, y))`` and application to input data
    into a single call for splitting (and optionally subsampling) data in a
    oneliner.
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    
    test_size : float, int, None, optional
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. By default, the value is set 

In [112]:
from sklearn.preprocessing import StandardScaler

In [113]:
# Show the built-in documentation for StandardScaler.
help(StandardScaler)

Help on class StandardScaler in module sklearn.preprocessing.data:

class StandardScaler(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  Standardize features by removing the mean and scaling to unit variance
 |  
 |  Centering and scaling happen independently on each feature by computing
 |  the relevant statistics on the samples in the training set. Mean and
 |  standard deviation are then stored to be used on later data using the
 |  `transform` method.
 |  
 |  Standardization of a dataset is a common requirement for many
 |  machine learning estimators: they might behave badly if the
 |  individual feature do not more or less look like standard normally
 |  distributed data (e.g. Gaussian with 0 mean and unit variance).
 |  
 |  For instance many elements used in the objective function of
 |  a learning algorithm (such as the RBF kernel of Support Vector
 |  Machines or the L1 and L2 regularizers of linear models) assume that
 |  all features are centered around 0 an

In [116]:
from sklearn.preprocessing import StandardScaler
# Eight samples with seven columns: five 0/1 indicator columns followed by
# two numeric columns on very different scales.
data = [
    [0.0, 1.0, 0.0, 1.0, 0.0, 40.0, 63777.7778],
    [1.0, 0.0, 1.0, 0.0, 0.0, 37.0, 67000.0],
    [0.0, 1.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
    [0.0, 1.0, 0.0, 0.0, 1.0, 38.7777778, 52000.0],
    [1.0, 0.0, 1.0, 0.0, 0.0, 48.0, 79000.0],
    [0.0, 1.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
    [1.0, 0.0, 1.0, 0.0, 0.0, 44.0, 72000.0],
    [1.0, 0.0, 1.0, 0.0, 0.0, 35.0, 58000.0],
]

# fit() returns the fitted scaler, so it can be bound directly.
scaler = StandardScaler().fit(data)

# Show the scaler's settings, the per-column means it learned, and the
# standardized data, one after another.
print(scaler, scaler.mean_, scaler.transform(data), sep="\n")

StandardScaler(copy=True, with_mean=True, with_std=True)
[5.00000000e-01 5.00000000e-01 5.00000000e-01 1.25000000e-01
 3.75000000e-01 3.84722222e+01 6.25972222e+04]
[[-1.          1.         -1.          2.64575131 -0.77459667  0.26306757
   0.12381479]
 [ 1.         -1.          1.         -0.37796447 -0.77459667 -0.25350148
   0.46175632]
 [-1.          1.         -1.         -0.37796447  1.29099445 -1.97539832
  -1.53093341]
 [-1.          1.         -1.         -0.37796447  1.29099445  0.05261352
  -1.11141978]
 [ 1.         -1.          1.         -0.37796447 -0.77459667  1.64058505
   1.7202972 ]
 [-1.          1.         -1.         -0.37796447  1.29099445 -0.0813118
  -0.16751412]
 [ 1.         -1.          1.         -0.37796447 -0.77459667  0.95182631
   0.98614835]
 [ 1.         -1.          1.         -0.37796447 -0.77459667 -0.59788085
  -0.48214934]]


In [117]:
# Round trip: undoing the standardization recovers the original values.
scaler.inverse_transform(scaler.transform(data))

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 4.00000000e+01, 6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.70000000e+01, 6.70000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 2.70000000e+01, 4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 3.87777778e+01, 5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4.80000000e+01, 7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 3.80000000e+01, 6.10000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4.40000000e+01, 7.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3

In [118]:
from sklearn.preprocessing import MinMaxScaler

In [119]:
# Show the built-in documentation for MinMaxScaler.
help(MinMaxScaler)

Help on class MinMaxScaler in module sklearn.preprocessing.data:

class MinMaxScaler(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  Transforms features by scaling each feature to a given range.
 |  
 |  This estimator scales and translates each feature individually such
 |  that it is in the given range on the training set, i.e. between
 |  zero and one.
 |  
 |  The transformation is given by::
 |  
 |      X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
 |      X_scaled = X_std * (max - min) + min
 |  
 |  where min, max = feature_range.
 |  
 |  This transformation is often used as an alternative to zero mean,
 |  unit variance scaling.
 |  
 |  Read more in the :ref:`User Guide <preprocessing_scaler>`.
 |  
 |  Parameters
 |  ----------
 |  feature_range : tuple (min, max), default=(0, 1)
 |      Desired range of transformed data.
 |  
 |  copy : boolean, optional, default True
 |      Set to False to perform inplace row normalization and avoid a
 |   

In [120]:
# Four samples with two features; this rebinds `data` and `scaler`, which
# earlier cells used for the StandardScaler example.
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

# fit() returns the fitted scaler, so bind it directly and print its settings.
scaler = MinMaxScaler().fit(data)
print(scaler)

MinMaxScaler(copy=True, feature_range=(0, 1))


In [126]:
# Per-feature minimum and maximum seen during fit, one per line.
print(scaler.data_min_, scaler.data_max_, sep="\n")

[-1.  2.]
[ 1. 18.]


In [122]:
# Rescale each feature of the fitted data into the default (0, 1) range.
scaler.transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [125]:
# The first value (2) is above the maximum seen during fit (1), so its
# scaled value falls outside the (0, 1) range.
scaler.transform([[2,2]])

array([[1.5, 0. ]])

In [127]:
# Round trip: inverse_transform recovers the original values, including the
# out-of-range one.
scaler.inverse_transform(scaler.transform([[2,2]]))

array([[2., 2.]])