In [2]:
# Multiple Linear Regression Using Backward Elimination Based On p-values

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
# Every column except the last one is a feature; column index 4 is the target.
dataset = pd.read_csv('50_Startups.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, 4].values

# Encoding categorical data
# Column 3 holds the categorical feature. OneHotEncoder's
# `categorical_features` argument has been removed from scikit-learn, so the
# column is selected through a ColumnTransformer instead. OneHotEncoder
# accepts the raw category values, so no LabelEncoder pass is needed first.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    [('one_hot', onehotencoder, [3])],
    remainder='passthrough',  # keep the other feature columns unchanged
    sparse_threshold=0,       # always return a dense array
)
# The encoded dummy columns come first, followed by the passthrough columns
# (the same layout the old `categorical_features` API produced).
# NOTE: ColumnTransformer fits a clone of `onehotencoder`; the fitted encoder
# is available as column_transformer.named_transformers_['one_hot'].
# The explicit float cast matters when X arrives as an object array
# (presumably the case here because of the mixed column types).
X = np.asarray(column_transformer.fit_transform(X), dtype=float)

# Avoiding the Dummy Variable Trap
# Drop the first dummy column so the remaining dummies are not perfectly
# collinear with an intercept term.
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
# 20% of the rows are held out; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fitting Multiple Linear Regression to the Training set
# fit() returns the estimator itself, so construction and fitting are chained.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression().fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)


# Backward Elimination
# Repeatedly fit an OLS model and drop the column with the largest p-value
# until every remaining column is significant at level `sl`.
import statsmodels.api as sn
# Prepend the intercept column to both splits. has_constant='add' forces the
# column to be added even when a split happens to contain a constant feature
# column; the default ('skip') would return that split unchanged and leave
# the train and test matrices with different column layouts.
X_test_red = sn.add_constant(X_test, has_constant='add')
X_train_red = sn.add_constant(X_train, has_constant='add')

# Equivalent manual construction of the intercept column:
#   X = np.column_stack((np.ones((len(X), 1)), X))

# Kept so the `sm` alias stays available; the array-based OLS class used
# below comes from statsmodels.api (`sn`), since statsmodels.formula.api no
# longer exposes it.
import statsmodels.formula.api as sm

sl = 0.05  # significance level: columns with a larger p-value are dropped
while True:
    regressor_ols = sn.OLS(endog=y_train, exog=X_train_red).fit()
    p_values = pd.Series(regressor_ols.pvalues)
    max_val = p_values.max()
    # Stop when nothing exceeds the significance level (this also covers a
    # NaN maximum), or when only one column is left so the design matrix is
    # never emptied.
    if not max_val > sl or X_train_red.shape[1] <= 1:
        break
    # Remove exactly one column per step: p-values change after each refit,
    # so tied maxima must not be dropped together.
    worst_column = p_values.idxmax()
    # Apply the same deletion to both splits to keep their columns aligned.
    X_train_red = np.delete(X_train_red, worst_column, axis=1)
    X_test_red = np.delete(X_test_red, worst_column, axis=1)

# `regressor_ols` is the model fitted on the final reduced column set.
y_pred_elim = regressor_ols.predict(X_test_red)

# Compare test-set R^2 of the full model against the reduced model.
from sklearn.metrics import r2_score
baseline_r2 = r2_score(y_test, y_pred)
print("Score without backward elimination: {}".format(baseline_r2))
reduced_r2 = r2_score(y_test, y_pred_elim)
print("Score with backward elimination: {}".format(reduced_r2))

Score without backward elimination: 0.9347068473282446
Score with backward elimination: 0.9474386447268482


If you used a LabelEncoder before this OneHotEncoder only to convert the categories to integers, you can now use the OneHotEncoder directly.
