<a href="https://colab.research.google.com/github/tirtharajghosh/Machine-Learning/blob/master/Multiple_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing required libraries

In [81]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing dataset

In [82]:
dataset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Datasets/50_Startups.csv')
# Features: every column except the last one.
x = dataset.iloc[:, :-1].values
# Target: the last column. It is indexed relative to the end, like the feature
# slice above, so the two selections cannot overlap if the column count changes
# (a fixed index of 4 would also be part of `x` in a wider frame).
y = dataset.iloc[:, -1].values

#print("Value of X :\n",x)
#print("Value of Y :\n",y)

### Encoding categorical data - Dummy Variables
Note: We don't always need to use `LabelEncoder` anymore. Instead, we use `ColumnTransformer`.

In [83]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 3 of the feature matrix; every other column is
# forwarded untouched via remainder='passthrough'.
ct = ColumnTransformer(
    transformers=[("State", OneHotEncoder(), [3])],
    remainder='passthrough',
)
x = ct.fit_transform(x)

#print("Value of X after Dummy Variable Encoding:\n",x)

### Avoiding the Dummy Variable Trap
It is good practice to avoid the dummy variable trap manually. Although the scikit-learn library takes care of this, we don't want to depend on that behaviour.

In [84]:
# Keep every row but discard column 0, so one dummy column is dropped.
x = x[:, 1:]
#print("Value of X :",x)

### Splitting dataset - Training and Test
  Note: the `sklearn.cross_validation` module is deprecated and has been replaced by `sklearn.model_selection`, which is where `train_test_split` is imported from below.



In [85]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

### Fitting Multiple Linear Regression to the Training Set

In [86]:
from sklearn.linear_model import LinearRegression

# Create the estimator, then fit it on the training split only.
# The fit call stays as the cell's last expression so the fitted estimator
# is still displayed as the cell output.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Predicting the Test Set Result

In [87]:
from tabulate import tabulate

# Predict on the held-out rows and print actual vs. predicted side by side.
y_pred = regressor.predict(x_test)
print(
    tabulate(
        zip(y_test, y_pred),
        headers=["Test", "Prediction"],
        tablefmt="github",
    )
)

|     Test |   Prediction |
|----------|--------------|
| 103282   |     103015   |
| 144259   |     132582   |
| 146122   |     132448   |
|  77798.8 |      71976.1 |
| 191050   |     178537   |
| 105008   |     116161   |
|  81229.1 |      67851.7 |
|  97483.6 |      98791.7 |
| 110352   |     113969   |
| 166188   |     167921   |


### Building the optimal model using Backward Elimination 
Significance level: SL = 0.05. At each stage, fit the model and remove the predictor with the highest p-value, as long as that p-value is greater than SL.

In [88]:
import statsmodels.api as sm

# Prepend a column of ones to the feature matrix; it becomes column 0 and is
# the constant (intercept) term in the OLS fits below.
# The number of rows is read from `x` itself instead of being hard-coded as 50,
# so this cell keeps working if the dataset size changes.
# NOTE: re-running this cell prepends another column of ones; re-run the
# notebook from the top instead.
x = np.append(arr = np.ones((x.shape[0], 1), dtype = int), values = x, axis = 1)
#print(x)

Stage 1

In [92]:
# Stage 1: start with the intercept column (0) and all five predictors (1-5).
# The selection is cast to float, not int: an integer cast would truncate any
# fractional part of the predictor values and perturb the fitted coefficients.
x_opt = np.array(x[:, [0, 1, 2, 3, 4, 5]], dtype=float)
regressor_OLS = sm.OLS(endog=y, exog=x_opt).fit()
# Last expression: display the regression summary (coefficients and p-values).
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Fri, 31 Jul 2020",Prob (F-statistic):,1.34e-27
Time:,20:19:11,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.855,7.281,0.000,3.63e+04,6.4e+04
x1,198.7542,3371.026,0.059,0.953,-6595.103,6992.611
x2,-42.0063,3256.058,-0.013,0.990,-6604.161,6520.148
x3,0.8060,0.046,17.368,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.783,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.267
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


Stage 2

In [93]:
# Stage 2: refit without column 2.
# The selection is cast to float, not int: an integer cast would truncate any
# fractional part of the predictor values and perturb the fitted coefficients.
x_opt = np.array(x[:, [0, 1, 3, 4, 5]], dtype=float)
regressor_OLS = sm.OLS(endog=y, exog=x_opt).fit()
# Last expression: display the regression summary (coefficients and p-values).
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Fri, 31 Jul 2020",Prob (F-statistic):,8.49e-29
Time:,20:20:09,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.901,7.537,0.000,3.67e+04,6.35e+04
x1,220.1847,2900.553,0.076,0.940,-5621.828,6062.197
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.759,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.173
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


Stage 3

In [95]:
# Stage 3: refit without columns 1 and 2.
# The selection is cast to float, not int: an integer cast would truncate any
# fractional part of the predictor values and perturb the fitted coefficients.
x_opt = np.array(x[:, [0, 3, 4, 5]], dtype=float)
regressor_OLS = sm.OLS(endog=y, exog=x_opt).fit()
# Last expression: display the regression summary (coefficients and p-values).
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Fri, 31 Jul 2020",Prob (F-statistic):,4.53e-30
Time:,20:21:35,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.384,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.839,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.443
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.587,Cond. No.,1400000.0


Stage 4

In [96]:
# Stage 4: refit with the intercept column (0) and columns 3 and 5 only.
# The selection is cast to float, not int: an integer cast would truncate any
# fractional part of the predictor values and perturb the fitted coefficients.
x_opt = np.array(x[:, [0, 3, 5]], dtype=float)
regressor_OLS = sm.OLS(endog=y, exog=x_opt).fit()
# Last expression: display the regression summary (coefficients and p-values).
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Fri, 31 Jul 2020",Prob (F-statistic):,2.1600000000000003e-31
Time:,20:22:08,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.941,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.265,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.678,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.162
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


Stage 5

In [97]:
# Stage 5: final model with the intercept column (0) and column 3 only.
# The selection is cast to float, not int: an integer cast would truncate any
# fractional part of the predictor values and perturb the fitted coefficients.
x_opt = np.array(x[:, [0, 3]], dtype=float)
regressor_OLS = sm.OLS(endog=y, exog=x_opt).fit()
# Last expression: display the regression summary (coefficients and p-values).
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Fri, 31 Jul 2020",Prob (F-statistic):,3.5000000000000004e-32
Time:,20:23:04,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.900,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.538
Skew:,-0.911,Prob(JB):,9.43e-05
Kurtosis:,5.361,Cond. No.,165000.0
