# sklearn
This is a machine learning library.

## Standard Setup

In [290]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:75% !important; margin-left:350px; }</style>"))
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
pd.set_option( 'display.notebook_repr_html', False)  # render Series and DataFrame as text, not HTML
pd.set_option( 'display.max_column', 10)    # number of columns
pd.set_option( 'display.max_rows', 10)     # number of rows
pd.set_option( 'display.width', 90)        # number of characters per row

## The Library
sklearn **does not automatically import its subpackages**. Therefore all subpackages must be explicitly imported before use.

In [291]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model    import LinearRegression
from sklearn.pipeline        import make_pipeline
from sklearn.preprocessing   import PolynomialFeatures
from sklearn.metrics         import *

# The imputer moved from sklearn.preprocessing (Imputer) to sklearn.impute
# (SimpleImputer) in scikit-learn 0.20, and the old class was removed in 0.22.
# Bind both names so every cell works regardless of the installed release.
try:
    from sklearn.impute import SimpleImputer
    Imputer = SimpleImputer
except ImportError:
    from sklearn.preprocessing   import Imputer
    SimpleImputer = Imputer

import statsmodels.formula.api as smf

## Data Splitting

### Sample Data
Generate 100 rows of data with three features (X1, X2, X3) and one dependent variable (Y).

In [292]:
# Synthetic regression data: Y is a known linear combination of three integer
# features plus an intercept and an integer noise term.
rng = np.random.RandomState(25)  # fixed seed so a fresh "Run All" regenerates the same data
n = 100  # number of samples
I = 5  # intercept value
E = rng.randint( 1,20, n)  # Error
X1 = rng.randint( 1,n+1, n)
X2 = rng.randint( 1,n+1, n)
X3 = rng.randint( 1,n+1, n)
Y = 0.1*X1 + 0.2*X2 + 0.3*X3 + E + I
mydf = pd.DataFrame({
    'Y':Y,
    'X1':X1,
    'X2':X2,
    'X3':X3
})
mydf.shape

(100, 4)

In [293]:
mydf.head()

   X1  X2  X3     Y
0  92  18  33  31.7
1  58  89  34  52.8
2  39  23  80  56.5
3  49  60  60  58.9
4  91  81  64  67.5

### Method 1: Split One Dataframe Into Two (Train & Test)

```
traindf, testdf = train_test_split( df, test_size=, random_state= ) 
 # random_state : seed number (integer), optional
 # test_size    : fraction of 1, 0.2 means 20%
```

In [294]:
traindf, testdf = train_test_split(mydf,test_size=0.2, random_state=25)

In [295]:
print (len(traindf))
print (len(testdf))

80
20


### Method 2: DataFrame in X,Y, split into x_train/test, y_train/test
```
x_train, x_test, y_train, y_test = train_test_split( X,Y, test_size=, random_state= )
 # random_state : seed number (integer), optional
 # test_size    : fraction of 1, 0.2 means 20%
```

**Split DataFrame into X and Y First**

In [296]:
# Separate the predictor columns from the response column.
feature_cols = ['X1', 'X2', 'X3']
X = mydf.loc[:, feature_cols]
Y = mydf['Y']

**Then Split X/Y into x_train/test, y_train/test**

In [297]:
# Hold out 20% of the rows; the fixed random_state keeps the partition repeatable.
splits = train_test_split(X, Y, test_size=0.2, random_state=25)
x_train, x_test, y_train, y_test = splits
for part in (x_train, x_test):
    print(len(part))

80
20


## Polynomial Transform
This can be used as part of feature engineering, to introduce new features for data that seems to fit a quadratic (or higher-order polynomial) model.

### Single Variable

#### Sample Data
Data must be 2-D before polynomial features can be applied. The code below converts a 1-D array into a 2-D array.

In [298]:
# Reshape the 1-D vector into a single-column matrix with one row per sample.
x = np.arange(1, 6)
X = x.reshape(-1, 1)
X

array([[1],
       [2],
       [3],
       [4],
       [5]])

#### Degree 1
Degree one keeps the original features. No new features are created.

In [299]:
PolynomialFeatures(degree=1, include_bias=False).fit_transform(X)

array([[ 1.],
       [ 2.],
       [ 3.],
       [ 4.],
       [ 5.]])

#### Degree 2
Degree-1 original   feature:  x  
Degree-2 additional features:  x^2  

In [300]:
PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)

array([[  1.,   1.],
       [  2.,   4.],
       [  3.,   9.],
       [  4.,  16.],
       [  5.,  25.]])

#### Degree 3
Degree-1 original   feature:  x  
Degree-2 additional features:  x^2  
Degree-3 additional features:  x^3

In [301]:
PolynomialFeatures(degree=3, include_bias=False).fit_transform(X)

array([[   1.,    1.,    1.],
       [   2.,    4.,    8.],
       [   3.,    9.,   27.],
       [   4.,   16.,   64.],
       [   5.,   25.,  125.]])

#### Degree 4
Degree-1 original   feature:  x  
Degree-2 additional features:  x^2  
Degree-3 additional features:  x^3  
Degree-4 additional features:  x^4

In [302]:
PolynomialFeatures(degree=4, include_bias=False).fit_transform(X)

array([[   1.,    1.,    1.,    1.],
       [   2.,    4.,    8.,   16.],
       [   3.,    9.,   27.,   81.],
       [   4.,   16.,   64.,  256.],
       [   5.,   25.,  125.,  625.]])

### Two Variables

#### Sample Data

In [303]:
# Two-feature sample frame: x1 runs 1..5 and x2 runs 6..10.
x1_values = list(range(1, 6))
x2_values = list(range(6, 11))
X = pd.DataFrame({'x1': x1_values, 'x2': x2_values})
X

   x1  x2
0   1   6
1   2   7
2   3   8
3   4   9
4   5  10

#### Degree 2
```
Degree-1 original   features:  x1,     x2  
Degree-2 additional features:  x1^2,   x1:x2,  x2^2   (listed in output column order)
```

In [304]:
PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)

array([[   1.,    6.,    1.,    6.,   36.],
       [   2.,    7.,    4.,   14.,   49.],
       [   3.,    8.,    9.,   24.,   64.],
       [   4.,    9.,   16.,   36.,   81.],
       [   5.,   10.,   25.,   50.,  100.]])

#### Degree 3
```
Degree-1 original   features:  x1,       x2  
Degree-2 additional features:  x1^2,     x2^2,   x1:x2 
Degree-3 additional features:  x1^3,     x1^2:x2,  x1:x2^2,  x2^3   (listed in output column order)
```

In [279]:
PolynomialFeatures(degree=3, include_bias=False).fit_transform(X)

array([[    1.,     6.,     1.,     6.,    36.,     1.,     6.,    36.,
          216.],
       [    2.,     7.,     4.,    14.,    49.,     8.,    28.,    98.,
          343.],
       [    3.,     8.,     9.,    24.,    64.,    27.,    72.,   192.,
          512.],
       [    4.,     9.,    16.,    36.,    81.,    64.,   144.,   324.,
          729.],
       [    5.,    10.,    25.,    50.,   100.,   125.,   250.,   500.,
         1000.]])

## Imputation of Missing Data

### Sample Data

In [280]:
from numpy import nan

# 5 samples x 3 features; nan marks the two missing entries to be imputed.
rows = [
    [nan, 0,   3],
    [3,   7,   9],
    [3,   5,   2],
    [4,   nan, 6],
    [8,   8,   1],
]
X = np.array(rows)

# Target values, one per sample row.
y = np.array([14, 16, -1, 8, -5])

In [281]:
# Replace each missing entry with the mean of its column.
# SimpleImputer is the current scikit-learn class; the legacy Imputer was removed in 0.22.
imp = SimpleImputer(strategy='mean')
X2 = imp.fit_transform(X)
X2

array([[ 4.5,  0. ,  3. ],
       [ 3. ,  7. ,  9. ],
       [ 3. ,  5. ,  2. ],
       [ 4. ,  5. ,  6. ],
       [ 8. ,  8. ,  1. ]])

In [282]:
y

array([14, 16, -1,  8, -5])

## Pipeline

In [312]:
(X,y)

(   x1  x2
 0   1   6
 1   2   7
 2   3   8
 3   4   9
 4   5  10, array([14, 16, -1,  8, -5]))

With any of the preceding examples, it can quickly become tedious to do the transformations by hand, especially if you wish to string together multiple steps. For example, we might want a processing pipeline that looks something like this:

- **Impute** missing values using the mean  
- **Transform** features to quadratic  
- **Fit** a linear regression  

In [306]:
# Chain the three steps so one fit/predict call runs them in order:
# mean imputation -> quadratic feature expansion -> linear regression.
# SimpleImputer replaces the legacy Imputer class, which was removed in scikit-learn 0.22.
my_model = make_pipeline (
    SimpleImputer(strategy='mean'),
    PolynomialFeatures(degree=2),
    LinearRegression()
)

In [284]:
my_model.fit( X, y) # train the model
print (y)
print (my_model.predict(X))

[14 16 -1  8 -5]
[ 14.  16.  -1.   8.  -5.]


# Linear Regression

## The Library

In [285]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model    import LinearRegression
from sklearn.metrics        import mean_absolute_error, mean_squared_error
import statsmodels.api   as sm

## The Math

## Sample Data

### Data Emulation

In [286]:
# Emulated regression data: Y = 0.1*X1 + 0.2*X2 + 0.3*X3 + noise + intercept.
rng = np.random.RandomState(25)  # fixed seed so the emulated data is reproducible
n = 200  # number of samples
I = 250  # intercept value
E = rng.randint( 1,20, n)  # Error
X1 = rng.randint( 1,n+1, n)
X2 = rng.randint( 1,n+1, n)
X3 = rng.randint( 1,n+1, n)
Y = 0.1*X1 + 0.2*X2 + 0.3*X3 + E + I

**Put All Data In pandas DataFrame**

In [287]:
# Collect the response and the three features into one frame, then preview it.
columns_by_name = dict(Y=Y, X1=X1, X2=X2, X3=X3)
mydf = pd.DataFrame(columns_by_name)
mydf.head()

    X1   X2   X3      Y
0  154  102   81  324.1
1  193    2   68  305.1
2   35  162  177  343.0
3  155  160  110  334.5
4  127  127    6  306.9

### Data Validation

Ensure there is **no col-linearity** among the features used

#### Correlation Check

In [288]:
# Pairwise correlation of the features, taken directly from mydf.
# X is still a NumPy array from an earlier section here, so X.corr() raises AttributeError.
mydf[['X1', 'X2', 'X3']].corr()

#### Matrix Scatter Plot

In [None]:
# Plot from mydf directly: the feature frame X is only created later, in Data Preparation.
pd.plotting.scatter_matrix(mydf[['X1', 'X2', 'X3']], alpha=0.2, figsize=(6, 6), diagonal='hist');

In [None]:
# Heat-map of the pairwise feature correlations, read straight from mydf
# so the cell does not depend on whatever X currently holds.
feature_frame = mydf[['X1', 'X2', 'X3']]
tick_positions = range(len(feature_frame.columns))
plt.matshow(feature_frame.corr())
plt.xticks(tick_positions, feature_frame.columns)  # label each axis with the feature names
plt.yticks(tick_positions, feature_frame.columns)
plt.colorbar()
plt.show()

## Modeling (scikit-learn)

### Data Preparation

#### Preparing Features and Dependent Value

In [None]:
feature_cols = ['X1','X2','X3']
X = mydf[feature_cols]
Y = mydf.Y

#### Splitting Data Into Training and Test Sets

In [None]:
# 80/20 split; random_state pins the shuffle so the metrics below are repeatable.
trainX,testX,trainY,testY = train_test_split(X,Y,test_size=0.2, random_state=25)

### Create The Model

In [None]:
lm = LinearRegression()   # create linear regression object
lm.fit( trainX, trainY )  # train the model using training set

### Analyze The Model
#### Intercept

In [None]:
lm.intercept_

#### Coef

In [None]:
lm.coef_

In [None]:
# Tabulate each feature name next to its fitted coefficient.
pd.DataFrame({'features': X.columns, 'coef': lm.coef_}, columns=['features', 'coef'])

#### R-Squared

In [None]:
predTrain = lm.predict( trainX )

In [None]:
r2_score( trainY, predTrain )

### Model Performance
#### Run Prediction On Train Data

In [None]:
trainPred = lm.predict( trainX )

#### Mean Absolute Error (MAE)

In [None]:
mean_absolute_error( trainY, trainPred )

#### Mean Squared Error (MSE)

In [None]:
mean_squared_error( trainY, trainPred )

#### Root Mean Squared Error (RMSE)

In [None]:
math.sqrt( mean_squared_error( trainY, trainPred ) )

### Test Prediction

In [None]:
testPred = lm.predict( testX )

In [None]:
mean_absolute_error( testY, testPred )

In [None]:
mean_squared_error( testY, testPred )

In [None]:
math.sqrt( mean_squared_error( testY, testPred ) )

## Modeling (statsmodels)

### Data Preparation
Splitting data into training set and testing set.

In [None]:
# Two splits: the whole frame for the formula method, X/Y for the array method.
# random_state pins both partitions so the fitted summaries are repeatable.
traindf, testdf = train_test_split(mydf, test_size=0.2, random_state=25)
trainX,testX,trainY,testY = train_test_split(X,Y,test_size=0.2, random_state=25)

### Create The Model - Equation Method

In [None]:
fit = smf.ols(formula='Y ~ X1 + X2 + X3', data=traindf).fit()

#### Analyze The Model

In [None]:
print (fit.summary())

### Create The Model - Array Method
**Intercept** is not included in array-based OLS modeling by default. Hence `add_constant()` must be applied to the training dataset in order to obtain an intercept estimate.

In [None]:
trainX = sm.add_constant(trainX)     # adds a constant column of 1s so OLS estimates an intercept
# The array-based OLS class is exposed by statsmodels.api (sm);
# statsmodels.formula.api (smf) is the formula interface, e.g. smf.ols.
fit2 = sm.OLS(trainY, trainX).fit()

#### Analyze The Model

In [None]:
print (fit2.summary())

In [None]:
rng = np.random.RandomState(1)
x = 10 * rng.rand(50)
2*x+1

## Examples

### Example 1 - Linear Regression

#### Sample Data
Plot looks like polynomial.

In [None]:
x = np.array([1, 2, 3, 4, 5])
y = np.array([4, 2, 1, 3, 7])
plt.scatter(x, y);

#### Build The Model

**Prepare The Data**  
X needs to be at least 2D. Increase the dimension with newaxis

In [None]:
X = x[:, np.newaxis]
X

**Fit and Predict**

In [None]:
# Fit a straight line to the sample points, then overlay its predictions on the scatter.
fit = LinearRegression()
fit.fit(X, y)
pred = fit.predict(X)
plt.scatter(x, y)
plt.plot(x, pred)

### Example 2 - Linear Regression with Polynomial Basis Functions

#### Sample Data

In [None]:
x = np.array([1, 2, 3, 4, 5])
y = np.array([4, 2, 1, 3, 7])
plt.scatter(x, y);

#### Clean Method - Use Pipeline
This method **avoids manually** creating engineered features.

In [None]:
# Rebuild the 2-D design matrix from this example's x so the cell does not
# rely on X left over from Example 1.
X = x[:, np.newaxis]
# Degree-3 polynomial expansion followed by linear regression, fitted in one call.
poly_model = make_pipeline( PolynomialFeatures(3), LinearRegression())
poly_model.fit( X,y)
pred2 = poly_model.predict(X)
## plot
plt.scatter(x, y)
plt.plot(x,pred2);

#### Alternative Method - Use Transform
This method involves creating a PolynomialFeatures object and transforming the original data (X) into additional engineered features according to the chosen degree.

In [None]:
# Expand X into polynomial terms up to degree 3 (no bias column),
# then fit a linear regression on the expanded matrix.
poly = PolynomialFeatures(degree=3, include_bias=False)
X2 = poly.fit(X).transform(X)
fit2 = LinearRegression()
fit2.fit(X2, y)
pred2 = fit2.predict(X2)
## plot
plt.scatter(x, y)
plt.plot(x, pred2)

## Feature Selection

The advantage of using the scikit-learn package is that it provides a feature-selection method that works more or less like backward selection (not exactly), called **Recursive Feature Elimination (RFE)**. How it works:

- Model run with all variables, weight is assigned to each variable
- Variable with smallest weight will be pruned from next iteration
- Run the model again till the number of desired features is left

### The Library

In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

In [None]:
estimator = SVR(kernel='linear')       # we are using linear model
# n_features_to_select is passed by keyword: current scikit-learn rejects it positionally.
selector = RFE (estimator, n_features_to_select=2, step=1)  # we want just 2 features
# Fit on the regression frame explicitly: at this point X holds the 5x1 array
# from the polynomial examples, which does not match the 200-row Y.
selector = selector.fit(mydf[['X1', 'X2', 'X3']], mydf['Y'])  # execute

In [None]:
selector.support_

In [None]:
selector.ranking_

In [None]:
selector.estimator_