## ML Model Data Pipeline
Created by Vincent Lao.

### Pipeline:
clean data $\rightarrow$ `preprocess_data()` $\rightarrow$ `run_model()`  
$\hspace{4cm} or$  
clean data $\rightarrow$ `preprocess_data()` $\rightarrow$ `linreg_kfold_cv()`  
$\hspace{4cm} or$  
clean data $\rightarrow$ `preprocess_data()` $\rightarrow$ `tree_kfold_cv()`   
$\hspace{4cm} or$  
clean data $\rightarrow$ `preprocess_data()` $\rightarrow$ `forward_selection()`   

### Function of Pipeline:

Given a dataset, y variable column name, and a defined model (e.g. LinearRegression()), do the following:  
1. Split the data into training and testing data.
2. Train the model on the training set, and predict on the test set.
3. Calculate the MSE, and plot some diagnostics.

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.preprocessing import normalize

# !pip install mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from scipy.stats import randint

In [45]:
def preprocess_data(data, y_col, test_size=0.2, random_state=100, standardize_cols=None):
    """
    Optionally rescale selected columns, then split a dataset into training and
    testing sets of features and response variable.

    Input: data,             a Pandas dataframe (it is never modified in place)
           y_col,            a string that is the name of the response variable
           test_size,        a float between 0 and 1 indicating the fraction of the data to include in the test split
           random_state,     an integer, used to define the random state of the split
           standardize_cols, a column name or a list of column names to rescale, or None (default)
                             to leave every column untouched. None entries are ignored, so the
                             legacy value [None] also means "no rescaling".

    Output: X_train, training data feature matrix (every column except y_col)
            X_test,  testing data feature matrix
            y_train, training data response variable
            y_test,  testing data response variable

    Notes: - Rescaling uses scikit-learn's normalize(..., axis=0), i.e. each selected column is
             divided by its L2 norm. It does not center the column or scale it to unit variance.
           - When columns are rescaled, the row index is reset to 0..n-1 and the rescaled columns
             are moved to the end of the dataframe.
           - NOTE(review): the scaling is computed on the full dataset before the split, so the
             test rows influence the scaling factors.
    """

    # Build a clean list of columns: accept None, a single name, or any iterable of names.
    if standardize_cols is None:
        cols = []
    elif isinstance(standardize_cols, str):
        cols = [standardize_cols]
    else:
        cols = [col for col in standardize_cols if col is not None]

    if cols:

        print('Standardizing data...')
        # drop=True discards the old index whatever its name is; this also returns a new
        # dataframe, so the caller's data is left untouched.
        data = data.reset_index(drop=True)

        scaled = pd.DataFrame(normalize(data[cols], axis=0), columns=cols)

        # Both frames share the same 0..n-1 index, so the concat lines rows up one-to-one.
        data = pd.concat([data.drop(columns=cols), scaled], axis=1)

    print('Splitting data...')
    X = data.loc[:, data.columns != y_col]
    y = data[y_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test



def _report_fit(model, X, y, label):
    """Print the MSE and score of an already fitted model on (X, y); return its predictions."""
    preds = model.predict(X)

    # mean_squared_error expects (y_true, y_pred); np.asarray accepts Series, arrays and lists.
    mse = mean_squared_error(np.asarray(y), preds)
    print(f'[{label} MSE:', str(np.round(mse, 4)) + ']')

    r_2 = model.score(X, y)
    print(f'[{label} R^2:', str(r_2) + ']')

    return preds


def run_model(X_train, X_test, y_train, y_test, model, diagnostics = False):
    """
    This function takes in data that has been split by scikit-learn's train_test_split
    and a model that has been initialized, fits the model on the training data, and
    reports the MSE and the model's score on both the training and testing data.

    Input: X_train, X_test, y_train, y_test, the output of preprocess_data()

           model, an initialized scikit-learn style model exposing fit(), predict() and score(),
                  i.e. LinearRegression(), Ridge(), Lasso(),
                       DecisionTreeRegressor(), RandomForestRegressor()
           diagnostics, a boolean indicating if you would like to plot the diagnostic plots
                        of summarize_diagnostics() for the test set

    Output: printed MSE and R^2 (the value of model.score) of both the training and testing sets
            model, the fitted model (the same object that was passed in, fitted in place)
    """

    # fit model with scikit-learn
    print('Fitting Model...')
    model.fit(X_train, y_train)

    # evaluate on the data the model was fitted on, then on the held-out data
    _report_fit(model, X_train, y_train, 'Train')
    preds_test = _report_fit(model, X_test, y_test, 'Test')

    # plot the test-set residual diagnostics if requested
    if diagnostics:
        summarize_diagnostics(preds_test, y_test)

    print('----FINISHED----')

    return model



def summarize_diagnostics(preds, y_test):
    """
    Draw two side-by-side diagnostic scatter plots for a set of predictions.

    Input: preds,  the predicted (fitted) values
           y_test, the true values, in the same order as preds

    Output: None. A 10x4 inch matplotlib figure is created with
            left:  fitted values vs. residuals (true value minus fitted value)
            right: fitted values vs. true values
    """

    # Work on plain 1D arrays so the two inputs are paired by position, whatever
    # container (Series, array, list) or index they arrive with.
    fitted = np.asarray(preds).ravel()
    actual = np.asarray(y_test).ravel()
    resids = actual - fitted

    # Create both panels in a single call so the figure holds exactly two Axes.
    fig, (resid_ax, true_ax) = plt.subplots(1, 2, figsize=(10, 4))

    # plot residuals
    resid_ax.set_title('Fitted Values vs. Residuals')
    resid_ax.scatter(fitted, resids, color='blue')
    resid_ax.set_ylabel('Residual')
    resid_ax.set_xlabel('Fitted Values')

    # plot true values
    true_ax.set_title('Fitted Values vs. True Values')
    true_ax.scatter(fitted, actual, color='blue')
    true_ax.set_ylabel('True Values')
    true_ax.set_xlabel('Fitted Values')

    fig.tight_layout()

    # save plot to file
    #  plt.savefig('../../visualizations/diagnostic_plot.png')
    #  plt.close()

### Cross Validation

In [26]:
def linreg_kfold_cv(ModelCV, X_train, X_test, y_train, y_test, alphas, n_splits = 5, random_state=100):
    """
    This function takes either RidgeCV or LassoCV, chooses the regularization strength alpha
    by k-fold cross validation on the training data, and evaluates the result on the test data.
    Be careful to standardize the input data beforehand!

    Input: ModelCV,      either RidgeCV or LassoCV (the class itself, before initializing it).
           X_train, X_test, y_train, y_test, output from preprocess_data()
           alphas,       a list or array of alpha values you would like to test
           n_splits,     number of folds you would like for your cv
           random_state, seed used to shuffle the data into folds, for reproducibility

    Output: print statements of the optimal alpha value + test set MSE and R^2
            alpha_opt, the chosen optimal alpha value on the input data
    """

    kf = KFold(n_splits = n_splits, shuffle=True, random_state=random_state)
    cv = ModelCV(cv = kf, alphas = alphas)
    cv.fit(X_train, y_train)

    alpha_opt = cv.alpha_
    print("optimal alpha:", alpha_opt)

    y_pred_cv = cv.predict(X_test)

    # Use the class name directly; fall back to the string form for callables
    # that have no __name__, so the report never raises.
    model_name = getattr(ModelCV, '__name__', None) or str(ModelCV)

    cv_mse = mean_squared_error(y_test, y_pred_cv)
    print("Test MSE with cross-validated", model_name + ":", cv_mse)

    r_2 = cv.score(X_test, y_test)
    print('[R^2:', str(r_2) + ']')

    return alpha_opt

In [29]:
def tree_kfold_cv(tree, param_dist, X_train, y_train, n_splits = 5, n_iter = 200, random_state=100):
    """
    This function takes an *instantiated* Decision Tree/Random Forest, and runs a randomized
    hyperparameter search with cross validation on the input data.

    Input: tree,             a DecisionTreeRegressor/RandomForestRegressor (after instantiation)
           param_dist,       a dictionary mapping parameter names to the distributions (or lists)
                             of values you would like to sample from
           X_train, y_train, output from preprocess_data()
           n_splits,         number of folds you would like for your cv
           n_iter,           number of samples of parameter settings (more = slower, but chance for better model)
           random_state,     seed passed to RandomizedSearchCV for reproducibility

    Output: print statements of the best mean cross-validated score (the estimator's own
            score(), which is R^2 for scikit-learn regressors) and the parameter values
            that achieved it
            rnd_search, the fitted RandomizedSearchCV object, so that best_estimator_,
                        best_params_ and cv_results_ remain available to the caller
    """

    rnd_search = RandomizedSearchCV(tree, param_distributions=param_dist,
                                    cv=n_splits, n_iter=n_iter, random_state = random_state)
    rnd_search.fit(X_train, y_train)

    print(rnd_search.best_score_)
    print(rnd_search.best_params_)

    return rnd_search

#### Example param dists for decision trees and random forests

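Make sure every key in the `param_dist` you pass is a valid hyperparameter of the `DecisionTreeRegressor` or `RandomForestRegressor` you instantiate (e.g. `n_estimators` only exists for the forest, so use `tree_param_dist` with a tree and `forest_param_dist` with a forest).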

In [30]:
from scipy.stats import randint
# Search space for a single decision tree.
# scipy's randint(low, high) draws integers from low up to, but not including, high.
tree_param_dist = dict(
    max_leaf_nodes=randint(3, 100),
    max_features=randint(2, 25),
    max_depth=randint(1, 10),
)

# Search space for a random forest: the same tree-level ranges, plus the
# number of trees in the ensemble.
forest_param_dist = dict(
    max_leaf_nodes=randint(3, 100),
    max_features=randint(2, 25),
    max_depth=randint(1, 10),
    n_estimators=randint(50, 200),
)

### Feature Selection

In [37]:
def forward_selection(model, X_train, y_train, n_splits=5, k_features=None):
    """
    This function takes an *instantiated* model and chooses a set of features
    to fit your data using cross-validated forward selection.

    Input: model,            any sklearn model *after* instantiation
           X_train, y_train, output from preprocess_data()
           n_splits,         number of folds you would like for your cv
           k_features,       number of features you would like your model to have. Any other
                             value accepted by mlxtend's SequentialFeatureSelector (a (min, max)
                             tuple, 'best' or 'parsimonious') is passed through unchanged.
                             None (default) is treated as 'best'.

    Output: the selector's k_feature_names_, i.e. the names of the chosen columns of X_train
    """

    # The selector only accepts an int, a tuple or a string for k_features, so
    # translate the None default into a value it understands.
    if k_features is None:
        k_features = 'best'

    sfs = SFS(model, k_features = k_features, cv = n_splits, forward=True)
    sfs.fit(X_train, y_train)

    return sfs.k_feature_names_

In [None]:
# Example: run forward selection for several different numbers of features
# m = LinearRegression()  
# for i in range(1,6):  
#     print(forward_selection(m, X_train, y_train, n_splits=5, k_features=i))  