# Regression Techniques

# Multiple linear regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the company data set from the CSV file in the working directory.
# The preview in the next cell shows its columns: R&D Spend, Administration,
# Marketing Spend, State and Profit.
dataset = pd.read_csv('Companies.csv')

In [3]:
# Preview the first rows to check the column names and value ranges.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Split the frame into the feature matrix and the target vector.
# Both slices are anchored on the last column (Profit), so they cannot overlap
# or drift apart if a column is ever added to or removed from the CSV.
X = dataset.iloc[:, :-1].values   # every column except the last: R&D, Admin, Marketing, State
y = dataset.iloc[:, -1].values    # the last column: Profit

In [None]:
# Inspect the raw feature matrix (three spend columns followed by the State name).
X

In [None]:
# Inspect the target vector (Profit).
y

In [7]:
# Encode the categorical State column (index 3) as integer codes.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelEncoderObj_X = LabelEncoder()
state_codes = labelEncoderObj_X.fit_transform(X[:, 3])   # one integer code per distinct state name
X[:, 3] = state_codes   # written back in place so the one-hot step can work on column 3

In [None]:
# Inspect X again: column 3 now holds the integer state codes instead of the state names.
X

In [9]:
# One-hot encode the State column (index 3) into one dummy column per state.
# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22, so the column is encoded on its own
# and then stitched back together with the remaining numeric columns.
# Resulting layout (unchanged): [state dummies..., R&D, Admin, Marketing].
onehotencoder = OneHotEncoder()
state_dummies = onehotencoder.fit_transform(X[:, [3]]).toarray()
numeric_features = np.delete(X, 3, axis=1).astype(float)   # X is an object array here; make it numeric
X = np.hstack([state_dummies, numeric_features])

In [None]:
# Inspect X after one-hot encoding: the state dummy columns come first,
# followed by the numeric spend columns.
X

In [11]:
# Avoiding the dummy variable trap.
# The trap occurs when every dummy column of one categorical variable is kept:
# each dummy is 0 or 1 and together they always sum to 1, so one of them is
# fully determined by the others and the regression cannot separate their effects.
# The remedy is to drop one dummy per categorical variable
# (3 dummy columns -> keep 2, 2 dummy columns -> keep 1).
#
# Author's note: the library is expected to cope with this on its own, so this
# step could be skipped for this example; it is done explicitly as a precaution.
X = X[:,1:]   # drop the first dummy column

In [None]:
# Inspect X after the first dummy column has been dropped.
X

In [13]:
# Training and testing split
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)



In [14]:
from sklearn.linear_model import LinearRegression
# Fit a multiple linear regression on the training split only;
# the test split is kept aside for the prediction step.
regressor = LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
# Predict the profit for the held-out test rows.
y_pred = regressor.predict(X_test)

In [16]:
# Display the predicted profits (one value per test row).
y_pred

array([103015.20159796, 132582.27760815, 132447.73845175,  71976.09851258,
       178537.48221056, 116161.24230166,  67851.69209676,  98791.73374687,
       113969.43533013, 167921.06569551])

In [17]:
# Building the optimal model by backward elimination
# Linear regression formula: y = b0 + b1*x1 + b2*x2 + ... + bn*xn
# statsmodels' OLS does not add the intercept b0 on its own, so we introduce a
# constant feature x0 = 1 by prepending a column of ones to the X matrix.

# OLS is exposed by `statsmodels.api`; `statsmodels.formula.api` only provides
# the lowercase formula interface (`ols`), so `sm.OLS` fails there on current releases.
import statsmodels.api as sm

# `arr` is the column of ones and `values` (X) is appended after it along axis 1.
# The row count is taken from X itself instead of a hard-coded 50.
X = np.append(arr = np.ones((X.shape[0], 1)).astype(int), values = X, axis = 1)


In [None]:
# Inspect X: the first column is now the constant column of ones.
X

In [19]:
# X_opt will hold the optimal set of independent variables, i.e. the ones that
# significantly influence Profit (the dependent variable).
# It starts with every column of X: const, D1, D2, RD, Admin, Mkt.
X_opt = X[:,[0,1,2,3,4,5]]

In [None]:
# To start the elimination we need the p-value of each independent variable.
# We set the significance level (SL) to 0.05 (5%): any independent variable whose p-value is below 5% stays in the model,
# and any independent variable whose p-value is above 5% is eliminated from the X matrix (highest p-value first).

In [None]:
# OLS: Ordinary Least Squares model

In [21]:
            # columns: const, D1, D2, RD, Admin, Mkt
# Step 1 of backward elimination: fit the full model with every column of X.
kept_columns = [0, 1, 2, 3, 4, 5]
X_opt = X[:, kept_columns]
ols_model = sm.OLS(endog=y, exog=X_opt)   # endog: dependent variable, exog: feature matrix
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Mon, 29 Jul 2019",Prob (F-statistic):,1.34e-27
Time:,15:52:33,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [None]:
# Next we remove index 2 (D2, i.e. x2 in the model summary) from the X matrix because its p-value of 0.990 is above the SL.
# Repeat this step until every remaining independent variable has a p-value below the SL.
# endog is the dependent variable; exog is the matrix of features.

In [22]:
            # columns: const, D1, RD, Admin, Mkt
# Step 2: refit with X column 2 (D2) left out.
kept_columns = [0, 1, 3, 4, 5]
X_opt = X[:, kept_columns]   # subset of the X feature matrix, not a full copy
ols_model = sm.OLS(endog=y, exog=X_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Mon, 29 Jul 2019",Prob (F-statistic):,8.49e-29
Time:,15:56:32,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [None]:
# Next we remove index 1 (D1, i.e. x1 in the model summary) from the X matrix because its p-value of 0.940 is above the SL.

In [23]:
        # columns: const, RD, Admin, Mkt
# Step 3: refit with both state dummies (X columns 1 and 2) left out.
kept_columns = [0, 3, 4, 5]
X_opt = X[:, kept_columns]
ols_model = sm.OLS(endog=y, exog=X_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Mon, 29 Jul 2019",Prob (F-statistic):,4.53e-30
Time:,15:58:43,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [None]:
# Next we remove Admin (column index 4 of the X matrix, shown as x2 in the model summary) because its p-value of 0.602 is above the SL.

In [24]:
          # columns: const, RD, Mkt
# Step 4: refit with Admin (X column 4) left out as well.
kept_columns = [0, 3, 5]
X_opt = X[:, kept_columns]
ols_model = sm.OLS(endog=y, exog=X_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Mon, 29 Jul 2019",Prob (F-statistic):,2.1600000000000003e-31
Time:,15:59:46,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [None]:
# Next we remove Mkt (column index 5 of the X matrix, shown as x2 in the model summary) because its p-value of 0.060 is above the SL.

In [25]:
            # columns: const, RD
# Final step: only the constant and R&D Spend (X column 3) remain.
kept_columns = [0, 3]
X_opt = X[:, kept_columns]
ols_model = sm.OLS(endog=y, exog=X_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Mon, 29 Jul 2019",Prob (F-statistic):,3.5000000000000004e-32
Time:,16:00:30,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [None]:
# Final model: y = 49030 + 0.8543 * RD   (const = 4.903e+04 and x1 = 0.8543 in the last summary)