### Overview Function

In [2]:
def cat_num_cols(data, top_val_lim=0):
    '''
    Split the columns of a DataFrame into categorical and numerical names.

    Columns whose dtype is ``object`` are reported as categorical and every
    other column as numerical. Both lists are printed. When requested, the
    leading entries of ``value_counts()`` are printed for each categorical
    column as well.

    Parameters:
    data (pandas df): the data
    top_val_lim (int): how many leading value-count entries to print for \
    every categorical variable; 0 (the default) skips that section

    Returns:
    cat_cols (list): names of the categorical variables
    num_cols (list): names of the numerical variables
    '''
    object_dtype = ['object']
    categorical_frame = data.select_dtypes(include=object_dtype)
    numerical_frame = data.select_dtypes(exclude=object_dtype)
    cat_cols = categorical_frame.columns.tolist()
    num_cols = numerical_frame.columns.tolist()

    print("\nCategorical variables:\n", cat_cols, "\n")
    print("Numerical variables:\n", num_cols)

    # Nothing more to show unless a positive/negative limit was requested.
    if top_val_lim == 0:
        return cat_cols, num_cols

    print("\nTop", top_val_lim, "unique value counts for Categorical variables:")
    for col_name in cat_cols:
        counts = data[col_name].value_counts()
        print("---", col_name, "---")
        print(counts[:top_val_lim], "\n")

    return cat_cols, num_cols

### EDA

### Preprocessing Function

#### Missing value

In [15]:
def missing_check(data, show_obs=0, feat_show_zero=True):
    '''
    Print the share of missing values per feature and, per observation, how
    many features are missing.

    Parameters:
    data (pandas df): data
    show_obs (int): default 0 prints nothing extra. Otherwise prints every
        observation that has show_obs or more missing features.
    feat_show_zero (bool): whether to keep features without missing values
        in the per-feature report (and in the returned series)

    Returns:
    miss_obs (pd series): number of missing features for each observation,
        indexed like data
    missing_feat (pd series): fraction (0 to 1) of missing values for each
        feature
    '''
    # Compute the null mask once; both summaries are derived from it.
    null_mask = data.isnull()

    # Fraction missing for each feature (mean of a boolean mask == sum/count)
    missing_feat = null_mask.mean()
    if not feat_show_zero:
        missing_feat = missing_feat[missing_feat != 0]

    print("Percentage of data missing for each feature:")
    print(missing_feat.sort_values(ascending=False), "\n")

    # Number of missing features for each observation
    miss_obs = null_mask.sum(axis=1)
    print("Count of the observations with one or more (#) missing features:")
    print("#    Count")
    print(miss_obs.value_counts().sort_index(), "\n")

    if show_obs != 0:
        print("Observation with ", show_obs, "or more missing features:")
        # Select rows with a positional boolean mask. Using index labels with
        # .iloc only works when the index happens to be 0..n-1.
        selected_rows = (miss_obs >= show_obs).values
        print(data[selected_rows].to_string())

    return miss_obs, missing_feat


def impute_cat_var(data):
    '''
    Impute the categorical (object dtype) variables of the data in place,
    replacing missing values with the most frequent value of the column.

    When several values tie for most frequent, the first one returned by
    ``Series.mode()`` is used. Columns with no non-missing value are left
    untouched because there is nothing to impute from.

    Parameter:
    data (pandas df): data; modified in place

    Returns:
    None
    '''
    cat_cols = data.select_dtypes(include=['object']).columns.tolist()

    for col in cat_cols:
        column = data[col]
        if not column.isnull().any():
            continue

        modes = column.mode()
        if modes.empty:
            # Column is entirely missing: no mode exists.
            continue

        # Assign back instead of `data[col].fillna(..., inplace=True)`:
        # the chained form operates on a temporary and does not update
        # `data` under pandas Copy-on-Write.
        data[col] = column.fillna(modes.iloc[0])

        
def impute_num_var(data, mean_impute=False):
    '''
    Impute the numerical (non-object dtype) variables of the data in place,
    replacing missing values with the column median or mean.

    Parameter:
    data (pandas df): data; modified in place
    mean_impute (Bool): whether impute with mean. Default is false (with median)

    Returns:
    None
    '''
    num_cols = data.select_dtypes(exclude=['object']).columns.tolist()

    for col in num_cols:
        column = data[col]
        if not column.isnull().any():
            continue

        # Only compute the statistic that is actually used.
        replace_val = column.mean() if mean_impute else column.median()

        # Assign back instead of `data[col].fillna(..., inplace=True)`:
        # the chained form operates on a temporary and does not update
        # `data` under pandas Copy-on-Write.
        data[col] = column.fillna(replace_val)

In [19]:
def _outlier_bounds(series, method, quatile, z_score):
    '''Return the (low, high) capping bounds of one column for `method`.'''
    if method == 'zscore':
        center = series.mean()
        spread = series.std()
        return center - z_score * spread, center + z_score * spread

    if method == 'iqr':
        per25 = series.quantile(0.25)
        per75 = series.quantile(0.75)
        iqr = per75 - per25
        return per25 - 1.5 * iqr, per75 + 1.5 * iqr

    # method == 'winsor'
    return series.quantile(quatile), series.quantile(1 - quatile)


def _plot_outlier_bounds(series, name, method, low, high):
    '''Plot one column and mark the [low, high] capping range on the x axis.'''
    # Apply the theme before drawing so it affects this plot as well.
    sns.set_theme(style="whitegrid")
    if method == 'zscore':
        # histplot(kde=True, stat="density") is the documented replacement
        # for the deprecated distplot.
        ax = sns.histplot(series, kde=True, stat="density")
        ax.set(title="Distribution Plot for {}".format(name))
    else:
        # Pass the series as `x=` so the box is horizontal and lines up with
        # the vertical bound markers below.
        ax = sns.boxplot(x=series)
        ax.set(title="Boxplot for {}".format(name))
    ax.axvline(low, linestyle="--", color="blue")
    ax.axvline(high, linestyle="--", color="blue")
    ax.axvspan(low, high, color='blue', alpha=0.1, lw=0)
    plt.show()


def cap_outliers(data, cols, method="zscore", cap=False,
                 quatile=0.01, percent=0.01, plot=True):
    '''
    Show and optionally cap the outliers of the given columns.

    For every column a lower and an upper bound are computed:
      - zscore: mean -/+ z * std, where z = norm.ppf(1 - percent)
      - iqr:    Q1 - 1.5 * IQR and Q3 + 1.5 * IQR
      - winsor: the `quatile` and `1 - quatile` quantiles
    Values outside the bounds are replaced by the nearest bound when `cap`
    is True.

    Parameter:
    data (pandas df): data; modified in place when cap is True
    cols (list): columns to cap
    method (string): zscore, iqr, or winsor. Any other value prints a
        message and does nothing.
    cap (Bool): whether to cap or not
    quatile (float): tail quantile for winsor (the parameter name keeps its
        original spelling so existing keyword callers keep working)
    percent (float): upper-tail probability used to derive the z-score
    plot (Bool): whether to draw a plot with the bounds for each column

    Returns:
    None
    '''
    if method not in ('zscore', 'iqr', 'winsor'):
        print("Wrong entry for method. Select one in (zscore, iqr, winsor)")
        return

    # The z-score only depends on `percent`, so compute it once.
    z_score = stats.norm.ppf(1 - percent) if method == 'zscore' else None

    for col in cols:
        low, high = _outlier_bounds(data[col], method, quatile, z_score)

        # Draw plot if needed
        if plot:
            _plot_outlier_bounds(data[col], col, method, low, high)

        # Cap if selected
        if cap:
            data[col] = np.select([data[col] > high, data[col] < low],
                                  [high, low], data[col])
            print("Capped!")

### Linear Model Assumption Check Function

In [None]:
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm, skew
from matplotlib import pyplot as plt

In [None]:
def get_vif_factors(X):
    '''
    Compute the variance inflation factor (VIF) of every column of X.

    Parameters:
    X (pandas df): the independent variables, one per column

    Returns
    vif_factors (pandas df): one row per column of X, holding the column
        name ("column") and its VIF ("VIF")
    '''
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    design = X.values
    column_count = design.shape[1]

    scores = []
    for position in range(column_count):
        scores.append(variance_inflation_factor(design, position))

    return pd.DataFrame({"column": X.columns, "VIF": scores})

In [None]:
def lin_assum_check(resid, X, ts=False, y_pred=None):
    '''
    Check the linear model assumptions.

    Draws a residual-vs-fitted scatter plot and a QQ plot of the residuals,
    prints the Breusch-Pagan p-value and the VIF of every column of X, and
    optionally prints the Durbin-Watson statistic.

    Parameters:
    resid (1D array): the residual of the model. Y-Y_fitted
    X (pandas df): the values of all independent variables
        (NOTE: statsmodels' Breusch-Pagan test expects the regressors to
        include a constant column - confirm that X has one)
    ts (Boolean): whether the data is time series data. Default is False
    y_pred (1D array): the fitted values of the model, used for the
        residual plot. When omitted, a module-level variable named
        ``y_pred`` is used if one exists, which is what earlier versions of
        this function silently relied on.

    Returns:
    None

    Raises:
    ValueError: if y_pred is not given and no global ``y_pred`` exists
    '''
    if y_pred is None:
        # Backward compatibility with notebook usage where the fitted values
        # lived in a global variable.
        y_pred = globals().get("y_pred")
        if y_pred is None:
            raise ValueError(
                "y_pred is required: pass the fitted values as "
                "lin_assum_check(resid, X, y_pred=...)")

    # Set up the canvas
    fig, axes = plt.subplots(nrows=2, figsize=(12,12))
    plt.subplots_adjust(hspace=0.3, wspace = 0.5)

    # 1. Linear relationship: Residual vs. y fitted value
    axes[0].scatter(y_pred, resid)
    axes[0].grid()
    axes[0].set_title('Residual vs. Y_Predicted')
    axes[0].set_xlabel('Y_Predicted')
    axes[0].set_ylabel('Residual')

    # 2. Residuals are Normally Distributed: QQ plot of residual
    stats.probplot(resid, plot=axes[1])

    # 3. variance of errors is constant: Test for heteroscedasticity.
    # p-val < 0.05 -> Error variances are not equal
    from statsmodels.stats.diagnostic import het_breuschpagan
    # Element [1] of the result is the p-value of the Lagrange multiplier test
    bptest = het_breuschpagan(resid, X)[1]
    print("Heteroscedasticity Test ------------------------\n")
    print("The p value of Breuchpagen test is ", bptest, ".\n")
    if bptest < 0.05:
        print("Data has heteroscedasticity.\n")
    else:
        print("Data has homoscedasticity.\n")

    # 4. No multicollinearity: VIF factors for testing multicollinearity
    vif_factors = get_vif_factors(X)

    print("VIF---------------------------------------------\n")
    print(vif_factors)
    print("\n(Usually a variable with VIF greater than 10 is considered to be troublesome.)\n")
    print("------------------------------------------------\n")

    # 5. No autocorrelation of Errors: Durbin Watson test - No need for non-time series data
    if ts:
        from statsmodels.stats.stattools import durbin_watson
        print(durbin_watson(resid))
        print(("A value between 1.8 and 2.2 indicates no autocorrelation. "
              "A value less than 1.8 indicates positive autocorrelation and a value greater" 
              " than 2.2 indicates negative autocorrelation"))