# Multiple Linear Regression

### Data preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv("50_Startups.csv")

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [4]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4].values

In [5]:
X, y 

(array([[165349.2, 136897.8, 471784.1, 'New York'],
        [162597.7, 151377.59, 443898.53, 'California'],
        [153441.51, 101145.55, 407934.54, 'Florida'],
        [144372.41, 118671.85, 383199.62, 'New York'],
        [142107.34, 91391.77, 366168.42, 'Florida'],
        [131876.9, 99814.71, 362861.36, 'New York'],
        [134615.46, 147198.87, 127716.82, 'California'],
        [130298.13, 145530.06, 323876.68, 'Florida'],
        [120542.52, 148718.95, 311613.29, 'New York'],
        [123334.88, 108679.17, 304981.62, 'California'],
        [101913.08, 110594.11, 229160.95, 'Florida'],
        [100671.96, 91790.61, 249744.55, 'California'],
        [93863.75, 127320.38, 249839.44, 'Florida'],
        [91992.39, 135495.07, 252664.93, 'California'],
        [119943.24, 156547.42, 256512.92, 'Florida'],
        [114523.61, 122616.84, 261776.23, 'New York'],
        [78013.11, 121597.55, 264346.06, 'California'],
        [94657.16, 145077.58, 282574.31, 'New York'],
        [91749.1

#### Creating dummy variable for State column

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [3])],     remainder='passthrough')
X=np.array(columnTransformer.fit_transform(X),dtype=np.str)

In [7]:
X

array([['0.0', '0.0', '1.0', '165349.2', '136897.8', '471784.1'],
       ['1.0', '0.0', '0.0', '162597.7', '151377.59', '443898.53'],
       ['0.0', '1.0', '0.0', '153441.51', '101145.55', '407934.54'],
       ['0.0', '0.0', '1.0', '144372.41', '118671.85', '383199.62'],
       ['0.0', '1.0', '0.0', '142107.34', '91391.77', '366168.42'],
       ['0.0', '0.0', '1.0', '131876.9', '99814.71', '362861.36'],
       ['1.0', '0.0', '0.0', '134615.46', '147198.87', '127716.82'],
       ['0.0', '1.0', '0.0', '130298.13', '145530.06', '323876.68'],
       ['0.0', '0.0', '1.0', '120542.52', '148718.95', '311613.29'],
       ['1.0', '0.0', '0.0', '123334.88', '108679.17', '304981.62'],
       ['0.0', '1.0', '0.0', '101913.08', '110594.11', '229160.95'],
       ['1.0', '0.0', '0.0', '100671.96', '91790.61', '249744.55'],
       ['0.0', '1.0', '0.0', '93863.75', '127320.38', '249839.44'],
       ['1.0', '0.0', '0.0', '91992.39', '135495.07', '252664.93'],
       ['0.0', '1.0', '0.0', '119943.24', '1

#### Avoiding dummy variable trap

In [8]:
X = X[:, 1:]

#### Splitting data into Train and Test Set 

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)

In [10]:
X_train, y_train

(array([['1.0', '0.0', '55493.95', '103057.49', '214634.81'],
        ['0.0', '1.0', '46014.02', '85047.44', '205517.64'],
        ['1.0', '0.0', '75328.87', '144135.98', '134050.07'],
        ['0.0', '0.0', '46426.07', '157693.92', '210797.67'],
        ['1.0', '0.0', '91749.16', '114175.79', '294919.57'],
        ['1.0', '0.0', '130298.13', '145530.06', '323876.68'],
        ['1.0', '0.0', '119943.24', '156547.42', '256512.92'],
        ['0.0', '1.0', '1000.23', '124153.04', '1903.93'],
        ['0.0', '1.0', '542.05', '51743.15', '0.0'],
        ['0.0', '1.0', '65605.48', '153032.06', '107138.38'],
        ['0.0', '1.0', '114523.61', '122616.84', '261776.23'],
        ['1.0', '0.0', '61994.48', '115641.28', '91131.24'],
        ['0.0', '0.0', '63408.86', '129219.61', '46085.25'],
        ['0.0', '0.0', '78013.11', '121597.55', '264346.06'],
        ['0.0', '0.0', '23640.93', '96189.63', '148001.11'],
        ['0.0', '0.0', '76253.86', '113867.3', '298664.47'],
        ['0.0', '1.0',

In [11]:
X_test, y_test

(array([['1.0', '0.0', '66051.52', '182645.56', '118148.2'],
        ['0.0', '0.0', '100671.96', '91790.61', '249744.55'],
        ['1.0', '0.0', '101913.08', '110594.11', '229160.95'],
        ['1.0', '0.0', '27892.92', '84710.77', '164470.71'],
        ['1.0', '0.0', '153441.51', '101145.55', '407934.54'],
        ['0.0', '1.0', '72107.6', '127864.55', '353183.81'],
        ['0.0', '1.0', '20229.59', '65947.93', '185265.1'],
        ['0.0', '1.0', '61136.38', '152701.92', '88218.23'],
        ['1.0', '0.0', '73994.56', '122782.75', '303319.26'],
        ['1.0', '0.0', '142107.34', '91391.77', '366168.42']], dtype='<U9'),
 array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
         81229.06,  97483.56, 110352.25, 166187.94]))

### Fitting Multiple Linear Regression to the Training set

In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
regressor = LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression()

### Predicting values from the Test set

In [14]:
y_pred = regressor.predict(X_test)

In [15]:
y_pred

array([103015.20159796, 132582.27760816, 132447.73845175,  71976.09851259,
       178537.48221054, 116161.24230163,  67851.69209676,  98791.73374688,
       113969.43533012, 167921.0656955 ])

Let's compare it with the actual values

In [16]:
y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])

### Checking the accuracy of the model

In [17]:
accuracy = regressor.score(X_test,y_test)
print('Accuracy of the model is',accuracy*100,'%')

Accuracy of the model is 93.47068473282857 %
