In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression, LassoCV, RidgeCV, ElasticNetCV

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.eval_measures import mse, rmse
from statsmodels.tsa.stattools import acf

from scipy import stats

<IPython.core.display.Javascript object>

Split columns into:
* `encode_cols`
* `binary_cols`
* `num_cols`

In [4]:
# Column groups for the transformer; fill these with the dataset's column names.
num_cols = []  # numeric features -> standardized
encode_cols = []  # categorical features -> one-hot encoded
preprocessing = ColumnTransformer(
    [
        ("scale_nums", StandardScaler(), num_cols),
        # `sparse_output` is the current name of the old `sparse` keyword
        # (renamed in scikit-learn 1.2, old spelling removed in 1.4).
        (
            "encode_cats",
            OneHotEncoder(drop="first", sparse_output=False),
            encode_cols,
        ),
    ],
    # Columns in neither list (e.g. already-binary columns) pass through untouched.
    remainder="passthrough",
)

<IPython.core.display.Javascript object>

In [None]:
# Template: chain the shared preprocessing with an estimator.
# `Model()` is a placeholder for the estimator class being tuned; the step
# name ("model name") is the prefix used for that step's keys in the grid.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("model name", Model()),
    ]
)

In [None]:
# Template hyperparameter grid for GridSearchCV.
# Each key is "<pipeline step name>__<hyperparameter>" and each value is the
# collection of candidate settings to try for that hyperparameter.
grid = {
    'model name + __ + hyperparam': 'some array or list',
    # ... one more entry per additional hyperparameter to search
}

In [None]:
# Search every combination in `grid` with 5-fold cross-validation, then show
# the best-scoring parameter combination as the cell's output.
pipeline_cv = GridSearchCV(estimator=pipeline, param_grid=grid, cv=5, verbose=1)
pipeline_cv.fit(X_train, y_train)
pipeline_cv.best_params_

### Logistic Regression

Hyperparameters
* `penalty`
    * l1, l2, elasticnet, none, default= l2
* `C`
    * default = 1.0
* `solver`
    * newton-cg, lbfgs, liblinear, sag, saga
    
Model
* `LogisticRegression()`

### Linear Regression

OLS Regression
* `import statsmodels.api as sm`
* `X = sm.add_constant(X)`
* `results = sm.OLS(Y, X).fit()`

Gauss-Markov Assumptions
* Linearity
* Error term should be zero on average
* Homoscedasticity
* Low multicollinearity
* Error terms should not be correlated
* Features should not be correlated with errors
* Normality of errors

In [1]:
class Markov:
    """Diagnostic helpers for checking linear-regression assumptions.

    On construction the same data is fit twice: once with scikit-learn's
    ``LinearRegression`` (used for predictions, residuals and coefficients)
    and once with statsmodels OLS on ``X`` plus an added constant (used for
    the statistical tests).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. The methods read ``X.columns`` and index columns by
        name, so a DataFrame is expected.
    y : array-like
        Target values, row-aligned with ``X``.

    Attributes
    ----------
    model : fitted ``LinearRegression``
    predictions : in-sample predictions of ``model``
    errors : ``y - predictions``
    X_const : ``X`` with a constant column added by ``sm.add_constant``
    lm_results : fitted statsmodels OLS results for ``y ~ X_const``
    coefs, intercepts : ``model.coef_`` and ``model.intercept_``
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.model = LinearRegression()
        self.model.fit(X, y)
        self.predictions = self.model.predict(X)
        self.errors = self.y - self.predictions
        self.X_const = sm.add_constant(X)
        self.lm_results = sm.OLS(y, self.X_const).fit()
        self.coefs = self.model.coef_
        self.intercepts = self.model.intercept_

    def plot_linearity(self):
        """Scatter each feature against the target, one subplot per feature.

        The grid is five subplots wide and grows downward as needed, so any
        number of features can be drawn.
        """
        cols = list(self.X.columns)
        n_grid_cols = 5
        # Ceiling division without importing math; always at least one row.
        n_grid_rows = max(1, -(-len(cols) // n_grid_cols))

        # 3 inches of height per row keeps the original 25x15 size at 5 rows.
        plt.figure(figsize=(25, 3 * n_grid_rows))
        for position, col in enumerate(cols, start=1):
            plt.subplot(n_grid_rows, n_grid_cols, position)
            # Plot the observed target (matching the axis label), not the
            # model's predictions.
            plt.scatter(self.X[col], self.y)
            plt.xlabel(col)
            plt.ylabel("target")

        plt.tight_layout()
        plt.show()

    def plot_homoscedasticity(self):
        """Plot residuals against predictions with a reference line at zero."""
        plt.scatter(self.predictions, self.errors)
        plt.xlabel("Predicted")
        plt.ylabel("Residual")
        plt.axhline(y=0)
        plt.title("Residual vs. Predicted")
        plt.show()

    def b_pagan(self):
        """Run the Breusch-Pagan heteroscedasticity test on the OLS residuals.

        Returns
        -------
        tuple
            ``(lm_pvalue, f_pvalue)`` as returned by ``het_breuschpagan``.
        """
        # Use the instance's fitted results and the design matrix that
        # includes the constant column (the test expects one).
        _, lmp, _, fp = het_breuschpagan(self.lm_results.resid, self.X_const)

        return lmp, fp

    def get_vifs(self):
        """Return the variance inflation factor of every column of ``X_const``.

        Returns
        -------
        pandas.Series
            VIFs indexed by column name (the added constant column included).
        """
        design = self.X_const.values
        vifs = [
            variance_inflation_factor(design, i) for i in range(design.shape[1])
        ]

        return pd.Series(vifs, index=self.X_const.columns)

    def plot_errors(self):
        """Plot the residuals in row order."""
        plt.plot(self.errors)
        plt.show()

    def plot_errors_acf(self):
        """Plot the autocorrelation of the residuals, skipping lag 0."""
        acf_data = acf(self.errors)

        # Lag 0 is dropped from the plot.
        plt.plot(acf_data[1:])
        plt.show()

    def plot_error_normality(self):
        """Show a Q-Q plot and a histogram of the OLS residuals."""
        residuals = self.lm_results.resid

        qqplot(residuals, line="s")
        plt.show()

        plt.hist(residuals)
        plt.show()

    def shapiro_wilkes(self):
        """Return the result of ``scipy.stats.shapiro`` on the OLS residuals."""
        return stats.shapiro(self.lm_results.resid)

Regularization
* the process of modifying algorithms in order to lower the generalization gap without sacrificing training performance
* `ridgeregr = Ridge(alpha=10**37)`
    * Ridge regression shrinks parameter estimates, but the estimates never reach exactly 0
* `lasso_cv = LassoCV(alphas=alphas, cv=5)`
    * Works by trying to force small parameter estimates to be equal to zero, effectively dropping them from the model
* `elasticregr = ElasticNet(alpha=10**21, l1_ratio=0.5)`
    * Combination of Ridge and Lasso

### Random Forests

Hyperparameters
* `max_depth`
    * too high--> overfit
* `max_leaf_nodes`
    * too high--> overfit
* `min_samples_leaf`
    * too low--> overfit
* `n_estimators`
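    * too low--> unstable, high-variance predictions; adding more trees does not cause overfitting, it only costs more compute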
* `max_features`
    * too high--> overfit
* `max_samples`
    * too high--> overfit

In [None]:
# Feature importances of the best estimator found by the search,
# tabulated per training column and ranked from most to least important.
importances = model.best_estimator_.feature_importances_
importance_df = pd.DataFrame(
    {"feat": X_train.columns, "importance": importances}
).sort_values("importance", ascending=False)
importance_df

In [None]:
class Entropy:
    '''
    Weighted average entropy of every "feature == value?" question.

    params:
        df: Categorical dataframe with no nulls
        target: The column name of the target variable of the model

    attributes:
        columns: feature column names (every column except target)
        col_values_d: callable alias of get_col_values_d
        wae_d: callable alias of get_wae_d
        best_questions: {feature: value with the smallest weighted
                        average entropy}, computed once at construction

    get_col_values_d: Returns a dict: {feature: array of unique values}

    get_entropy: Returns the base-2 entropy of an iterable of probabilities

    get_wae_d: Returns a dict: {feature: {value: weighted average entropy}}

    display_wae: params: Feature to display
                 function: displays weighted average
                           entropy of each value of that feature

    display_best_questions: Display the best question (smallest
                            weighted average entropy) of each feature
    '''

    def __init__(self, df, target):
        # Initialize class
        self.df = df
        self.target = target
        self.columns = df.drop(columns=[target]).columns
        # Kept as callables (not cached dicts) so existing
        # `obj.col_values_d()` / `obj.wae_d()` calls keep working.
        self.col_values_d = self.get_col_values_d
        self.wae_d = self.get_wae_d
        # Features with no values at all (empty frame) are skipped instead
        # of crashing on min() of an empty dict.
        self.best_questions = {
            feature: min(scores, key=scores.get)
            for feature, scores in self.get_wae_d().items()
            if scores
        }

    def get_col_values_d(self):
        # Returns {feature: [value,value,...], ...}
        # Reads the instance's own frame rather than a notebook global.
        return {col: self.df[col].unique() for col in self.columns}

    def get_entropy(self, probs):
        # Returns entropy (base 2) of given probabilities.
        # Zero probabilities contribute nothing (0 * log2(0) is taken as 0);
        # without the guard they would turn the result into NaN.
        return sum(-prob * np.log2(prob) for prob in probs if prob > 0)

    def get_wae_d(self):
        # Returns {feature: {value: weighted average entropy}, ...}
        entropy_d = dict()
        n_rows = self.df.shape[0]

        # For each value of each feature, split the rows on
        # "feature == value" and weight each side's target entropy by its
        # share of the rows.
        for feature, values in self.get_col_values_d().items():
            value_dict = dict()
            for value in values:
                is_value = self.df[feature] == value
                yes = self.df[is_value]
                no = self.df[~is_value]

                yes_entropy = self.get_entropy(
                    yes[self.target].value_counts(normalize=True)
                )
                no_entropy = self.get_entropy(
                    no[self.target].value_counts(normalize=True)
                )

                yes_weight = yes.shape[0] / n_rows
                no_weight = no.shape[0] / n_rows
                value_dict[value] = yes_weight * yes_entropy + no_weight * no_entropy
            entropy_d[feature] = value_dict

        return entropy_d

    def display_wae(self, column):
        # Display weighted average entropy of each value of `column`.
        # Prints nothing when `column` is not a feature.
        scores = self.get_wae_d().get(column)
        if scores is None:
            return

        print(column, '\n')
        for key, value in scores.items():
            # str() so non-string values (ints, bools, ...) can be shown too
            print(str(key) + ':', round(value, 5), '\n')

    def display_best_questions(self):
        # Display values with smallest weighted average entropy
        for k, v in self.best_questions.items():
            print(k, ':', v, '\n')