# Multiple Linear Regression

## Importing the libraries

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [4]:
# Read the startup data; every column except the last is a feature and the
# last column is the target to predict.
dataset = pd.read_csv("data/50_Startups.csv")
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [5]:
# Preview the first ten rows of the feature table.
print(X.iloc[:10])

   R&D Spend  Administration  Marketing Spend       State
0  165349.20       136897.80        471784.10    New York
1  162597.70       151377.59        443898.53  California
2  153441.51       101145.55        407934.54     Florida
3  144372.41       118671.85        383199.62    New York
4  142107.34        91391.77        366168.42     Florida
5  131876.90        99814.71        362861.36    New York
6  134615.46       147198.87        127716.82  California
7  130298.13       145530.06        323876.68     Florida
8  120542.52       148718.95        311613.29    New York
9  123334.88       108679.17        304981.62  California


In [6]:
# Preview the first ten values of the target column.
print(y.iloc[:10])

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
5    156991.12
6    156122.51
7    155752.60
8    152211.77
9    149759.96
Name: Profit, dtype: float64


## Encode Categorical Data

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to one-hot encode.
cat_features = ["State"]

# Fit the encoder and densify the result explicitly. Calling .toarray() on the
# default sparse output avoids the `sparse=` constructor argument, which newer
# scikit-learn releases renamed to `sparse_output=`.
encoder = OneHotEncoder()
encoded_values = encoder.fit_transform(X.loc[:, cat_features]).toarray()

# Build the "<feature>_<category>" column labels from the fitted categories.
# This gives the same labels as the encoder's feature-name helper without
# depending on `get_feature_names`, which newer releases removed in favour of
# `get_feature_names_out`.
encoded_columns = [
    f"{feature}_{category}"
    for feature, categories in zip(cat_features, encoder.categories_)
    for category in categories
]

# Reuse X's index so the column-wise concat below lines up row for row even
# when X does not carry a default 0..n-1 index.
X_encoded = pd.DataFrame(encoded_values, columns=encoded_columns, index=X.index)

# Put the encoded columns in front of the remaining features. drop() returns a
# new frame here, so the frame X was sliced from is not modified in place.
X = pd.concat([X_encoded, X.drop(columns=cat_features)], axis=1)

In [9]:
# Preview the feature table after encoding: encoded State columns come first.
print(X.iloc[:5])

   State_California  State_Florida  State_New York  R&D Spend  Administration  \
0               0.0            0.0             1.0  165349.20       136897.80   
1               1.0            0.0             0.0  162597.70       151377.59   
2               0.0            1.0             0.0  153441.51       101145.55   
3               0.0            0.0             1.0  144372.41       118671.85   
4               0.0            1.0             0.0  142107.34        91391.77   

   Marketing Spend  
0        471784.10  
1        443898.53  
2        407934.54  
3        383199.62  
4        366168.42  


## Splitting the dataset into Training set and Test set

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Report the row counts of the four split pieces, in split order.
print(*map(len, (X_train, X_test, y_train, y_test)))

40 10 40 10


## Training the Multiple Linear Regression model on the Training Set

In [12]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the training rows. The fit call is left as the cell's
# last expression so the fitted estimator is displayed as the cell output.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression()

## Predicting the Test set results

In [44]:
# Show pandas floats with 2 decimals (this is a global display setting).
pd.set_option("display.precision", 2)

# Predict the held-out rows, then tabulate prediction, truth and their
# difference (predicted minus actual).
y_pred = regressor.predict(X_test)
result_dataset = pd.DataFrame(
    {
        "Predicted Profit": y_pred,
        "Actual Profit": y_test,
    }
).assign(Error=lambda table: table["Predicted Profit"] - table["Actual Profit"])
print(result_dataset.to_string(index=False))

 Predicted Profit  Actual Profit     Error
        103015.20      103282.38   -267.18
        132582.28      144259.40 -11677.12
        132447.74      146121.95 -13674.21
         71976.10       77798.83  -5822.73
        178537.48      191050.39 -12512.91
        116161.24      105008.31  11152.93
         67851.69       81229.06 -13377.37
         98791.73       97483.56   1308.17
        113969.44      110352.25   3617.19
        167921.07      166187.94   1733.13


## Getting the final linear regression equation with the values of the coefficients

In [41]:
# Format the coefficients with an explicit precision instead of relying on
# whatever NumPy print options happen to be active in the kernel, so the
# printed values are the same after a kernel restart.
coef_text = np.array2string(regressor.coef_, precision=2)
print(f"Regression Coefficient : {coef_text}")
print(f"Regression Intercept : {regressor.intercept_}")

Regression Coefficient : [ 8.66e+01 -8.73e+02  7.86e+02  7.73e-01  3.29e-02  3.66e-02]
Regression Intercept : 42467.529248519335


Therefore, the equation of our multiple linear regression model is:

$$\textrm{Profit} = 86.6 \times \textrm{State_California} - 873 \times \textrm{State_Florida} + 786 \times \textrm{State_New York} + 0.773 \times \textrm{R&D Spend} + 0.0329 \times \textrm{Administration} + 0.0366 \times \textrm{Marketing Spend} + 42467.53$$

**Important Note:** To get these coefficients we read the `coef_` and `intercept_` attributes of our regressor object. Attributes in Python are different from methods: they are accessed without parentheses and usually hold a simple value or an array of values, rather than performing a computation when called.