In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Importing data

In [2]:
# Read the startup dataset; the last column (Profit) is the regression target.
data = pd.read_csv("50_Startups.csv")
print(data)

# Feature matrix: every column except the last one.
X = data.iloc[:, :-1].to_numpy()
# Target: the last column, sliced with `-1:` so it stays 2-D (one column).
Y = data.iloc[:, -1:].to_numpy()

print(X)
print(Y)

    R&D Spend  Administration  Marketing Spend       State     Profit
0   165349.20       136897.80        471784.10    New York  192261.83
1   162597.70       151377.59        443898.53  California  191792.06
2   153441.51       101145.55        407934.54     Florida  191050.39
3   144372.41       118671.85        383199.62    New York  182901.99
4   142107.34        91391.77        366168.42     Florida  166187.94
5   131876.90        99814.71        362861.36    New York  156991.12
6   134615.46       147198.87        127716.82  California  156122.51
7   130298.13       145530.06        323876.68     Florida  155752.60
8   120542.52       148718.95        311613.29    New York  152211.77
9   123334.88       108679.17        304981.62  California  149759.96
10  101913.08       110594.11        229160.95     Florida  146121.95
11  100671.96        91790.61        249744.55  California  144259.40
12   93863.75       127320.38        249839.44     Florida  141585.52
13   91992.39       

# Encoding Categorical data

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical "State" column (index 3 of the raw feature
# matrix). The encoded columns are placed first; the remaining numeric columns
# are passed through unchanged after them.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder="passthrough")
# Encode from the raw features held in `data` instead of from X itself, so that
# re-running this cell never tries to encode the already-encoded matrix (whose
# column 3 would then be a numeric spend column, not "State").
X = np.array(ct.fit_transform(data.iloc[:, :-1].values))
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3



# Splitting training and test data set

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)



# Training Multiple Linear Regression model on training set

In [5]:
from sklearn.linear_model import LinearRegression
# LinearRegression fits an ordinary least squares model on all supplied features;
# it does not select or eliminate features by itself. For feature elimination, see RFE:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
# Formula--> y = b0 + b1*x1 + b2*x2 + ... + bn*xn
# Feature scaling is not needed here: each coefficient absorbs the scale of its feature.
regressor = LinearRegression()
regressor.fit(X_train,Y_train)

LinearRegression()

# Predicting the test set

In [6]:
y_pred = regressor.predict(X_test)

# Show printed arrays with two decimal places.
np.set_printoptions(precision=2)

# Turn predictions and true values into single-column arrays and join them
# along axis=1, so each printed row reads [predicted, actual].
# Reference: https://www.superdatascience.com/pages/ml-regression-bonus-2
# (https://colab.research.google.com/drive/1ABjLFzknByfU4-F4roa1hX36H3aZlu6J?usp=sharing)
print(
    np.concatenate(
        (y_pred.reshape(-1, 1), Y_test.reshape(-1, 1)),
        axis=1,
    )
)

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


# Making a single prediction (for example the profit of a startup with R&D Spend = 160000, Administration Spend = 130000, Marketing Spend = 300000 and State = 'California')

In [7]:
# Each sample is laid out as the encoded training matrix is:
# [California, Florida, New York, R&D Spend, Administration, Marketing Spend].
# The first sample is the hypothetical California startup from the heading; the
# second repeats the feature values of dataset row 1 (California).
for sample in (
    [1.0, 0.0, 0.0, 160000, 130000, 300000],
    [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
):
    print(regressor.predict([sample]))

[[181566.92]]
[[189547.28]]


# Getting the final linear regression equation with the values of the coefficients

In [8]:
# Coefficients follow the encoded column order (three state dummies, then
# R&D Spend, Administration, Marketing Spend); the intercept is printed below them.
print(regressor.coef_, regressor.intercept_, sep="\n")

[[ 8.66e+01 -8.73e+02  7.86e+02  7.73e-01  3.29e-02  3.66e-02]]
[42467.53]


# Analysing test set with R squared

In [12]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out test set.
score = r2_score(y_true=Y_test, y_pred=y_pred)
print(score)

0.9347068473282303
