# Abstract
This example is derived from the Udemy "Machine Learning A-Z" course. It shows how to eliminate insignificant features in Multiple Linear Regression, based on the *P-value* and the *adjusted R-squared*. 

The *P-value* and the *adjusted R-squared* are calculated using the **statsmodels** Python library.


The P-value is useful for deciding which features have a significant influence on the prediction results. However, for features whose P-value is close to the significance level (SL), it is sometimes tricky to decide whether to keep them. Thus, an improved method that also considers the adjusted R-squared is introduced. The adjusted R-squared indicates the "goodness of fit". If removing a feature whose P-value is slightly higher than the threshold does not improve the adjusted R-squared, it is better not to remove that feature.


In [None]:
# Allow multiple outputs: with "all", IPython displays the value of every
# top-level expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# NOTE(review): `datetools` is not referenced anywhere else in the visible
# cells; `pandas.core.datetools` is a legacy module that is presumably gone
# from recent pandas releases -- confirm this import is still needed/available.
from pandas.core import datetools
import statsmodels.api as sm

# Set the numpy print format: floats are rendered with three decimals, with a
# leading space reserved for the sign of non-negative values.
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

In [None]:
def backwardEliminationWithPvalueOnly(X, y, SL):
    """
    Backward elimination driven by p-values alone.

    An OLS model of y on X is fitted repeatedly; after each fit the column
    with the largest p-value is dropped, as long as that p-value exceeds SL.

    X is the 2-D array of feature column(s)
    y is the dependent variable
    SL is the significance level threshold; a feature whose p-value is
    higher than SL is abandoned

    Returns X restricted to the columns that survived the elimination.
    The summary of every fitted model is printed.
    """
    # At most one column can be removed per pass, so the initial number of
    # columns bounds the number of passes.
    for _ in range(X.shape[1]):
        fitted = sm.OLS(y, X).fit()
        print(fitted.summary())

        pvalues = fitted.pvalues
        largest_pvalue = max(pvalues).astype(float)
        if not (largest_pvalue > SL):
            # Every remaining feature is significant: nothing left to drop.
            break

        # Drop the column holding the largest p-value.
        X = np.delete(X, np.argmax(pvalues), axis=1)
    return X

In [None]:
def backwardEliminationWithRsquare(X, y, SL):
    """
    Backward elimination using p-values, guarded by the adjusted R-squared.

    An OLS model of y on X is fitted repeatedly. The feature with the largest
    p-value is a removal candidate when that p-value exceeds SL, but it is
    only removed if the model without it has a strictly higher adjusted
    R-squared. The search stops as soon as either no p-value exceeds SL or
    the candidate removal does not improve the adjusted R-squared.

    X is the 2-D array of feature column(s)
    y is the dependent variable
    SL is the significance level threshold; a feature whose p-value is
    higher than SL is a candidate for removal

    Returns X restricted to the retained columns (X itself is not modified;
    np.delete returns new arrays). The summary of every accepted model is
    printed.
    """
    num_of_features = X.shape[1]
    for _ in range(num_of_features):
        regressor_OLS = sm.OLS(y, X).fit()
        print(regressor_OLS.summary())

        max_pvalue = float(max(regressor_OLS.pvalues))
        if not (max_pvalue > SL):
            # The largest p-value is within the threshold: all remaining
            # features are significant.
            break

        max_pvalue_pos = np.argmax(regressor_OLS.pvalues)
        adj_r2_before = float(regressor_OLS.rsquared_adj)

        # np.delete already returns a new array, so X stays intact until the
        # removal has been validated.
        X_reduced = np.delete(X, max_pvalue_pos, axis=1)
        adj_r2_after = float(sm.OLS(y, X_reduced).fit().rsquared_adj)

        if not (adj_r2_after > adj_r2_before):
            # Removing the candidate does not improve the fit. X would stay
            # the same on every further pass, so refitting the identical
            # model again is pointless: stop here.
            break

        X = X_reduced
    return X

In [None]:
# Define the Backward Elimination logic

def backwardElimination(X, y, SL):
    """
    Backward elimination using p-values with an adjusted R-squared rollback.

    An OLS model of y on X is fitted repeatedly. While the largest p-value
    exceeds SL, the corresponding column is tentatively removed and the
    reduced model is fitted. If the adjusted R-squared did not increase, the
    removal is rolled back and the matrix from before the removal is
    returned; otherwise the reduced matrix is kept and the search continues.

    X is the 2-D array of feature column(s)
    y is the dependent variable
    SL is the significance level threshold; a feature whose p-value is
    higher than SL is a candidate for removal

    Returns X restricted to the retained columns, in their original order
    and with their original values. The summaries of the fitted models are
    printed along the way.
    """
    num_of_features = X.shape[1]

    for _ in range(num_of_features):
        regressor_OLS = sm.OLS(y, X).fit()
        print(regressor_OLS.summary())

        max_pvalue = float(max(regressor_OLS.pvalues))
        if not (max_pvalue > SL):
            # All remaining features are significant; further passes would
            # only refit the same model.
            break

        adjR_before = float(regressor_OLS.rsquared_adj)
        worst = int(np.argmax(regressor_OLS.pvalues))

        # Tentative removal: np.delete returns a new array, so X still holds
        # the exact pre-removal matrix and a rollback needs no reconstruction
        # (rebuilding it from a stash is what corrupted the columns before).
        X_reduced = np.delete(X, worst, 1)
        tmp_regressor = sm.OLS(y, X_reduced).fit()
        adjR_after = float(tmp_regressor.rsquared_adj)
        print(tmp_regressor.summary())

        if adjR_before >= adjR_after:
            # Roll back: the removal did not improve the fit, so the model
            # from before the removal is the final one.
            print(regressor_OLS.summary())
            return X

        X = X_reduced
    return X

In [None]:
# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
dataset.head()
# Every column but the last is a feature; the last column is the target.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
# One-hot encode only the categorical column (index 3) and put the dummy
# columns in front of the remaining features. This is the layout the former
# `OneHotEncoder(categorical_features=[3])` call produced; that constructor
# argument no longer exists in current scikit-learn releases, so the column
# selection is done explicitly here.
onehotencoder = OneHotEncoder()
X = np.hstack((
    onehotencoder.fit_transform(X[:, [3]]).toarray(),
    # X is an object array at this point (mixed column types); convert the
    # untouched feature columns to float so the result is a numeric matrix.
    np.delete(X, 3, axis=1).astype(float),
))

# Avoiding the Dummy Variable Trap, remove redundant column
print("with dummy variabe")
X[:3]
# Drop the first dummy column: it is implied by the other dummy columns.
X = X[:, 1:]
print("without dummy variabe")
X[:3]

In [None]:
# Splitting the dataset into the Training set and Test set
# test_size = 0.2 holds out 20% of the rows for testing; random_state = 0
# fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
# Fitting Multiple Linear Regression to the Training set
# (all feature columns are used here; no feature elimination is applied yet)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# Predicting the Test set results
y_pred = regressor.predict(X_test)

In [None]:
# Try to get the significant features considering only the P-value
# add_constant adds a constant column to X so that the OLS model fitted by
# statsmodels includes an intercept term.
X_with_ones=sm.add_constant(X)
X_return = backwardEliminationWithPvalueOnly(X=X_with_ones, y=y, SL=0.05)
# Display the retained feature matrix.
X_return

In [None]:
# Try to get the significant features considering the P-value and the adjusted R-squared
# Method 1
# X_with_ones is the feature matrix with the added constant column, built in
# an earlier cell.
X_return = backwardEliminationWithRsquare(X=X_with_ones, y=y, SL=0.05)
# Display the retained feature matrix.
X_return

In [None]:
# Try to get the significant features considering the P-value and the adjusted R-squared
# Method 2
# X_with_ones is the feature matrix with the added constant column, built in
# an earlier cell.
X_return = backwardElimination(X=X_with_ones, y=y, SL=0.05)
# Display the retained feature matrix.
X_return