In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sklearn.linear_model import LinearRegression

In [3]:
ds = pd.read_csv('MLR_mix.csv')

In [4]:
ds.head()

Unnamed: 0,S&P department (spend),Admin department(spend),Operation department(spend),state,Profit
0,101184,70674,97009,A,259480
1,70132,52030,63587,B,137019
2,149539,98535,199994,A,495552
3,75821,53649,66500,B,165057
4,97730,68667,91754,B,247841


In [5]:
ds.shape

(50, 5)

In [6]:
x = ds.iloc[:, :-1].values
y = ds.iloc[:, 4].values

In [7]:
corr = ds.corr()
print(corr)

ValueError: could not convert string to float: 'A'

---

## Handling or Encode categorical variables

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

labelencoder_x = LabelEncoder()
x[:, 3] = labelencoder_x.fit_transform(x[:, 3])

ct = ColumnTransformer([("State", OneHotEncoder(), [3])], remainder = 'passthrough')
x = ct.fit_transform(x)
print(x)


[[1.0 0.0 101184 70674 97009]
 [0.0 1.0 70132 52030 63587]
 [1.0 0.0 149539 98535 199994]
 [0.0 1.0 75821 53649 66500]
 [0.0 1.0 97730 68667 91754]
 [0.0 1.0 84544 60923 81618]
 [1.0 0.0 117490 85379 146277]
 [0.0 1.0 119913 86499 150579]
 [0.0 1.0 145244 95108 189066]
 [1.0 0.0 147217 95598 190309]
 [0.0 1.0 81746 56815 72351]
 [1.0 0.0 107967 74058 106215]
 [1.0 0.0 74692 52493 64267]
 [1.0 0.0 72228 52210 63719]
 [0.0 1.0 96292 68521 90007]
 [1.0 0.0 124506 86995 153382]
 [0.0 1.0 113543 82796 122345]
 [1.0 0.0 142585 94589 184603]
 [1.0 0.0 118057 86227 146948]
 [1.0 0.0 86260 63082 85320]
 [1.0 0.0 63601 51026 59932]
 [1.0 0.0 113200 76388 115984]
 [0.0 1.0 141331 93725 171453]
 [1.0 0.0 79667 54864 69102]
 [0.0 1.0 112208 75407 115316]
 [1.0 0.0 82622 59754 72987]
 [0.0 1.0 127721 88888 153477]
 [0.0 1.0 117901 85532 146822]
 [0.0 1.0 147364 96275 191812]
 [1.0 0.0 110661 75404 114864]
 [0.0 1.0 107360 73604 100773]
 [0.0 1.0 87194 63403 87468]
 [1.0 0.0 75381 52575 66116]
 [1.0 

**Drop one of the columns after encoding.**

In [9]:
x = x[:, 1:]
print(x)

[[0.0 101184 70674 97009]
 [1.0 70132 52030 63587]
 [0.0 149539 98535 199994]
 [1.0 75821 53649 66500]
 [1.0 97730 68667 91754]
 [1.0 84544 60923 81618]
 [0.0 117490 85379 146277]
 [1.0 119913 86499 150579]
 [1.0 145244 95108 189066]
 [0.0 147217 95598 190309]
 [1.0 81746 56815 72351]
 [0.0 107967 74058 106215]
 [0.0 74692 52493 64267]
 [0.0 72228 52210 63719]
 [1.0 96292 68521 90007]
 [0.0 124506 86995 153382]
 [1.0 113543 82796 122345]
 [0.0 142585 94589 184603]
 [0.0 118057 86227 146948]
 [0.0 86260 63082 85320]
 [0.0 63601 51026 59932]
 [0.0 113200 76388 115984]
 [1.0 141331 93725 171453]
 [0.0 79667 54864 69102]
 [1.0 112208 75407 115316]
 [0.0 82622 59754 72987]
 [1.0 127721 88888 153477]
 [1.0 117901 85532 146822]
 [1.0 147364 96275 191812]
 [0.0 110661 75404 114864]
 [1.0 107360 73604 100773]
 [1.0 87194 63403 87468]
 [0.0 75381 52575 66116]
 [0.0 113427 78653 119763]
 [1.0 103004 71720 100065]
 [1.0 109249 74209 109578]
 [1.0 134157 92647 167690]
 [0.0 116481 83866 141719]
 [1

---

## Use Multiple Linear Regression

In [10]:
regressor = LinearRegression()
regressor.fit(x,y)

In [11]:
print('The coefficient is:', regressor.coef_)
print('The y-intercept is:', regressor.intercept_)

The coefficient is: [-7.54636193e+03  4.93526669e-01  6.74031082e+00 -5.15357914e-02]
The y-intercept is: -234499.13681915222


The multiple linear equation is
$$ \text{Profit} = -234499.13 -0.00753 \times \text{S&P department (spend)} + 0.4935 \times \text{Admin department(spend)} + 6.7403 \times \text{Operation department(spend)} - 0.0515 \times \text{state}$$

---

## Predict

# Question

Estimate the profit gained if S&P department spend RM90020, Admin department spend RM55200, 
Operation department spend RM94230 and the state is B

In [12]:
a = [[1.0, 90020, 55200, 94230]]
y_predict = regressor.predict(a)
print(y_predict)

[169590.71175309]


In [14]:
b = [[0, 90020, 55200, 94230]]
y_predict = regressor.predict(b)
print(y_predict)

[177137.07368062]
