# Multiple Linear Regression

### Data preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv("50_Startups.csv")

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [4]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4].values

In [5]:
X, y 

(array([[165349.2, 136897.8, 471784.1, 'New York'],
        [162597.7, 151377.59, 443898.53, 'California'],
        [153441.51, 101145.55, 407934.54, 'Florida'],
        [144372.41, 118671.85, 383199.62, 'New York'],
        [142107.34, 91391.77, 366168.42, 'Florida'],
        [131876.9, 99814.71, 362861.36, 'New York'],
        [134615.46, 147198.87, 127716.82, 'California'],
        [130298.13, 145530.06, 323876.68, 'Florida'],
        [120542.52, 148718.95, 311613.29, 'New York'],
        [123334.88, 108679.17, 304981.62, 'California'],
        [101913.08, 110594.11, 229160.95, 'Florida'],
        [100671.96, 91790.61, 249744.55, 'California'],
        [93863.75, 127320.38, 249839.44, 'Florida'],
        [91992.39, 135495.07, 252664.93, 'California'],
        [119943.24, 156547.42, 256512.92, 'Florida'],
        [114523.61, 122616.84, 261776.23, 'New York'],
        [78013.11, 121597.55, 264346.06, 'California'],
        [94657.16, 145077.58, 282574.31, 'New York'],
        [91749.1

#### Creating dummy variable for State column

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [3])],     remainder='passthrough')
X=np.array(columnTransformer.fit_transform(X),dtype=np.str_)

In [8]:
X

array([['0.0', '0.0', '1.0', '165349.2', '136897.8', '471784.1'],
       ['1.0', '0.0', '0.0', '162597.7', '151377.59', '443898.53'],
       ['0.0', '1.0', '0.0', '153441.51', '101145.55', '407934.54'],
       ['0.0', '0.0', '1.0', '144372.41', '118671.85', '383199.62'],
       ['0.0', '1.0', '0.0', '142107.34', '91391.77', '366168.42'],
       ['0.0', '0.0', '1.0', '131876.9', '99814.71', '362861.36'],
       ['1.0', '0.0', '0.0', '134615.46', '147198.87', '127716.82'],
       ['0.0', '1.0', '0.0', '130298.13', '145530.06', '323876.68'],
       ['0.0', '0.0', '1.0', '120542.52', '148718.95', '311613.29'],
       ['1.0', '0.0', '0.0', '123334.88', '108679.17', '304981.62'],
       ['0.0', '1.0', '0.0', '101913.08', '110594.11', '229160.95'],
       ['1.0', '0.0', '0.0', '100671.96', '91790.61', '249744.55'],
       ['0.0', '1.0', '0.0', '93863.75', '127320.38', '249839.44'],
       ['1.0', '0.0', '0.0', '91992.39', '135495.07', '252664.93'],
       ['0.0', '1.0', '0.0', '119943.24', '1

#### Avoiding dummy variable trap

In [9]:
X = X[:, 1:]

#### Splitting data into Train and Test Set 

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)

In [26]:
import pandas as pd

# Convert to DataFrame for easier handling
X_train = pd.DataFrame(X_train)

# Replace non-numeric columns with NaNs (if necessary)
X_train = X_train.apply(pd.to_numeric, errors='coerce')

# Fill NaNs with column means
X_train = X_train.fillna(X_train.mean())

# Convert back to a numpy array if needed
X_train = X_train.values


In [27]:
X_train, y_train

(array([[1.0000000e+00, 0.0000000e+00, 5.5493950e+04, 1.0305749e+05,
         2.1463481e+05],
        [0.0000000e+00, 1.0000000e+00, 4.6014020e+04, 8.5047440e+04,
         2.0551764e+05],
        [1.0000000e+00, 0.0000000e+00, 7.5328870e+04, 1.4413598e+05,
         1.3405007e+05],
        [0.0000000e+00, 0.0000000e+00, 4.6426070e+04, 1.5769392e+05,
         2.1079767e+05],
        [1.0000000e+00, 0.0000000e+00, 9.1749160e+04, 1.1417579e+05,
         2.9491957e+05],
        [1.0000000e+00, 0.0000000e+00, 1.3029813e+05, 1.4553006e+05,
         3.2387668e+05],
        [1.0000000e+00, 0.0000000e+00, 1.1994324e+05, 1.5654742e+05,
         2.5651292e+05],
        [0.0000000e+00, 1.0000000e+00, 1.0002300e+03, 1.2415304e+05,
         1.9039300e+03],
        [0.0000000e+00, 1.0000000e+00, 5.4205000e+02, 5.1743150e+04,
         0.0000000e+00],
        [0.0000000e+00, 1.0000000e+00, 6.5605480e+04, 1.5303206e+05,
         1.0713838e+05],
        [0.0000000e+00, 1.0000000e+00, 1.1452361e+05, 1.2261

In [28]:
X_test, y_test

(array([['1.0', '0.0', '66051.52', '182645.56', '118148.2'],
        ['0.0', '0.0', '100671.96', '91790.61', '249744.55'],
        ['1.0', '0.0', '101913.08', '110594.11', '229160.95'],
        ['1.0', '0.0', '27892.92', '84710.77', '164470.71'],
        ['1.0', '0.0', '153441.51', '101145.55', '407934.54'],
        ['0.0', '1.0', '72107.6', '127864.55', '353183.81'],
        ['0.0', '1.0', '20229.59', '65947.93', '185265.1'],
        ['0.0', '1.0', '61136.38', '152701.92', '88218.23'],
        ['1.0', '0.0', '73994.56', '122782.75', '303319.26'],
        ['1.0', '0.0', '142107.34', '91391.77', '366168.42']], dtype='<U9'),
 array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
         81229.06,  97483.56, 110352.25, 166187.94]))

### Fitting Multiple Linear Regression to the Training set

In [29]:
from sklearn.linear_model import LinearRegression

In [30]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)


### Predicting values from the Test set

In [32]:
import pandas as pd

# Convert to DataFrame for easier handling
X_test = pd.DataFrame(X_test)

# Convert non-numeric columns to NaNs (if necessary)
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Fill NaNs with column means or other appropriate values
X_test = X_test.fillna(X_test.mean())

# Convert back to a numpy array if needed
X_test = X_test.values


In [33]:
y_pred = regressor.predict(X_test)

In [34]:
y_pred

array([103015.20159795, 132582.27760816, 132447.73845175,  71976.09851258,
       178537.48221057, 116161.24230165,  67851.69209676,  98791.73374687,
       113969.43533013, 167921.06569552])

Let's compare it with the actual values

In [35]:
y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])

### Checking the accuracy of the model

In [36]:
accuracy = regressor.score(X_test,y_test)
print('Accuracy of the model is',accuracy*100,'%')

Accuracy of the model is 93.47068473283092 %


### Building the optimal model using Backward Elimination

In [38]:
pip install statsmodels

Collecting statsmodels
  Obtaining dependency information for statsmodels from https://files.pythonhosted.org/packages/59/9a/e466a1b887a1441141e52dbcc98152f013d85076576da6eed2357f2016ae/statsmodels-0.14.4-cp312-cp312-win_amd64.whl.metadata
  Downloading statsmodels-0.14.4-cp312-cp312-win_amd64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Obtaining dependency information for patsy>=0.5.6 from https://files.pythonhosted.org/packages/43/f3/1d311a09c34f14f5973bb0bb0dc3a6e007e1eda90b5492d082689936ca51/patsy-0.5.6-py2.py3-none-any.whl.metadata
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading statsmodels-0.14.4-cp312-cp312-win_amd64.whl (9.8 MB)
   ---------------------------------------- 0.0/9.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.8 MB 217.9 kB/s eta 0:00:45
   ---------------------------------------- 0.1/9.8 MB 737.3 kB/s eta 0:00:14
   -- ---


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
import statsmodels.api as sm

In [40]:
X = np.append(arr = np.ones((50,1)).astype(int), values = X , axis = 1)

In [41]:
X

array([['1', '0.0', '1.0', '165349.2', '136897.8', '471784.1'],
       ['1', '0.0', '0.0', '162597.7', '151377.59', '443898.53'],
       ['1', '1.0', '0.0', '153441.51', '101145.55', '407934.54'],
       ['1', '0.0', '1.0', '144372.41', '118671.85', '383199.62'],
       ['1', '1.0', '0.0', '142107.34', '91391.77', '366168.42'],
       ['1', '0.0', '1.0', '131876.9', '99814.71', '362861.36'],
       ['1', '0.0', '0.0', '134615.46', '147198.87', '127716.82'],
       ['1', '1.0', '0.0', '130298.13', '145530.06', '323876.68'],
       ['1', '0.0', '1.0', '120542.52', '148718.95', '311613.29'],
       ['1', '0.0', '0.0', '123334.88', '108679.17', '304981.62'],
       ['1', '1.0', '0.0', '101913.08', '110594.11', '229160.95'],
       ['1', '0.0', '0.0', '100671.96', '91790.61', '249744.55'],
       ['1', '1.0', '0.0', '93863.75', '127320.38', '249839.44'],
       ['1', '0.0', '0.0', '91992.39', '135495.07', '252664.93'],
       ['1', '1.0', '0.0', '119943.24', '156547.42', '256512.92'],
     

In [42]:
X_opt = np.array(X[:, [0, 1, 2, 3, 4, 5]], dtype=float)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [43]:
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sat, 02 Nov 2024",Prob (F-statistic):,1.34e-27
Time:,10:28:29,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [44]:
X_opt = np.array(X[:, [0, 1, 3, 4, 5]], dtype=float)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [45]:
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Sat, 02 Nov 2024",Prob (F-statistic):,8.49e-29
Time:,10:28:36,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [46]:
X_opt = np.array(X[:, [0, 3, 4, 5]], dtype=float)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [47]:
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Sat, 02 Nov 2024",Prob (F-statistic):,4.53e-30
Time:,10:28:43,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [48]:
X_opt = np.array(X[:, [0, 3, 5]], dtype=float)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [28]:
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Tue, 29 Jun 2021",Prob (F-statistic):,2.1600000000000003e-31
Time:,14:44:58,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [29]:
X_opt = np.array(X[:, [0, 3]], dtype=float)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [30]:
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Tue, 29 Jun 2021",Prob (F-statistic):,3.5000000000000004e-32
Time:,14:44:58,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [31]:
X_opt

array([[1.0000000e+00, 1.6534920e+05],
       [1.0000000e+00, 1.6259770e+05],
       [1.0000000e+00, 1.5344151e+05],
       [1.0000000e+00, 1.4437241e+05],
       [1.0000000e+00, 1.4210734e+05],
       [1.0000000e+00, 1.3187690e+05],
       [1.0000000e+00, 1.3461546e+05],
       [1.0000000e+00, 1.3029813e+05],
       [1.0000000e+00, 1.2054252e+05],
       [1.0000000e+00, 1.2333488e+05],
       [1.0000000e+00, 1.0191308e+05],
       [1.0000000e+00, 1.0067196e+05],
       [1.0000000e+00, 9.3863750e+04],
       [1.0000000e+00, 9.1992390e+04],
       [1.0000000e+00, 1.1994324e+05],
       [1.0000000e+00, 1.1452361e+05],
       [1.0000000e+00, 7.8013110e+04],
       [1.0000000e+00, 9.4657160e+04],
       [1.0000000e+00, 9.1749160e+04],
       [1.0000000e+00, 8.6419700e+04],
       [1.0000000e+00, 7.6253860e+04],
       [1.0000000e+00, 7.8389470e+04],
       [1.0000000e+00, 7.3994560e+04],
       [1.0000000e+00, 6.7532530e+04],
       [1.0000000e+00, 7.7044010e+04],
       [1.0000000e+00, 6.

### Analysing the results of Regression

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_opt,y,test_size = 0.2, random_state = 0)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)
accuracy = regressor.score(X_test,y_test)
print('Accuracy of the model is',accuracy*100,'%')

Accuracy of the model is 94.6458760778722 %


As you can see that the Backward Elimination helped the model to to increase its accuracy from 93.47% to 94.64%