# Multiple Linear Regression

![image.png](attachment:image.png)

In [1]:
# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this blanket filter also hides library warnings (e.g. from
# scikit-learn) that could point at real problems — consider narrowing it.
from warnings import filterwarnings
filterwarnings('ignore')

### Step 1: Read the dataset

In [2]:
import pandas as pd 
# Load the startups dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('50_Startups.csv')
# Preview the first five rows.
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


### PROFIT ~ RND, ADMIN, MKT

### Step 2: Perform Basic Data quality checks

In [3]:
# Column dtypes and non-null counts, to spot missing data and non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [4]:
# Number of missing values in each column.
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

In [5]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

0

### There are no missing values or duplicate rows in this data, hence the data is clean

### Step 3: Separate X and Y (PROFIT)

In [6]:
# Features: the three numeric spend columns (STATE is left out of this model).
X = df[['RND', 'ADMIN', 'MKT']]
# Target: double brackets keep Y as a one-column DataFrame rather than a Series.
Y = df[['PROFIT']]

In [7]:
# Preview the feature matrix.
X.head()

Unnamed: 0,RND,ADMIN,MKT
0,165349.2,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [8]:
# Preview the target column.
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


### Profit_pred = B0 + B1xRND + B2xADMIN + B3xMKT

### Step 4: Build the Linear Regression model

In [9]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares with default settings (an intercept is fitted).
model = LinearRegression()
# Fit on all 50 rows; no rows are held out for validation.
model.fit(X, Y)

### Getting the coefficients and intercept

In [10]:
# Fitted intercept (B0); a length-1 array because Y is a one-column DataFrame.
model.intercept_

array([50122.19298987])

In [11]:
# Fitted slopes (B1..B3), shape (1, 3), in the same order as X.columns.
model.coef_

array([[ 0.80571505, -0.02681597,  0.02722806]])

In [12]:
# Feature order, to line up each coefficient with its column.
X.columns

Index(['RND', 'ADMIN', 'MKT'], dtype='object')

### Profit_pred = 50122.19 + 0.8057xRND - 0.0268xADMIN + 0.0272xMKT

### RND has the highest impact on profit (coefficient of about 0.81 per unit of R&D spend)
### ADMIN and MKT have coefficients close to zero, so they do not have a major impact on profit

### Evaluate the model

In [13]:
# R2 score, computed on the same X and Y the model was fitted on (in-sample).
model.score(X, Y)

0.9507459940683246

### Predict the results for X

In [15]:
# In-sample predictions for every row in X.
ypred = model.predict(X)
# Show the first five predictions.
ypred[0:5]

array([[192521.25289008],
       [189156.76823227],
       [182147.2790962 ],
       [173696.70002553],
       [172139.51418327]])

In [16]:
# Actual PROFIT values for the same first five rows, for comparison with ypred.
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


### Get other metrics

In [17]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, \
                            mean_absolute_percentage_error, r2_score

In [18]:
# Mean squared error of the in-sample predictions (in squared PROFIT units).
mse = mean_squared_error(Y, ypred)
mse

78417126.01913083

In [19]:
# Root mean squared error: the square root of MSE, back in PROFIT units.
rmse = mse**(1/2)
rmse

8855.34448901514

In [20]:
# Mean absolute error of the in-sample predictions, in PROFIT units.
mae = mean_absolute_error(Y, ypred)
mae

6471.4503961048085

In [21]:
# Mean absolute percentage error, returned as a fraction (0.106 is about 10.6%).
mape = mean_absolute_percentage_error(Y, ypred)
mape

0.10601209160494125

In [22]:
# R2 via sklearn.metrics; gives the same value as the earlier model.score(X, Y) cell.
r2 = r2_score(Y, ypred)
r2

0.9507459940683246

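### Because the R2 score is 0.9507 > 0.8, the model fits the data well and is used below for out-of-sample prediction
Note: this R2 is computed on the same rows the model was trained on, so it may overstate how well the model performs on unseen data.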

### Out of Sample Predictions

In [24]:
# Recall the feature order the model was trained with.
X.columns

Index(['RND', 'ADMIN', 'MKT'], dtype='object')

In [26]:
# Raw 2-D array view of the first five feature rows (one inner list per row).
X.values[0:5]

array([[165349.2 , 136897.8 , 471784.1 ],
       [162597.7 , 151377.59, 443898.53],
       [153441.51, 101145.55, 407934.54],
       [144372.41, 118671.85, 383199.62],
       [142107.34,  91391.77, 366168.42]])

In [33]:
# Out-of-sample prediction for a single hypothetical startup.
# Spend figures are in the same units as the training columns.
rnd = 100000
admin = 60000
mkt = 120000
# The model was fitted on a DataFrame, so pass a DataFrame carrying the same
# column names in the same order (RND, ADMIN, MKT) as the training features.
# A bare nested list relies on positional order only and makes recent
# scikit-learn versions warn that X does not have valid feature names.
xnew = pd.DataFrame([[rnd, admin, mkt]], columns=['RND', 'ADMIN', 'MKT'])
# One row in, so the result is a 1x1 array holding the predicted PROFIT.
model.predict(xnew)

array([[132352.10765385]])