# Multiple Linear Regression

## Importing the libraries

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [75]:
# Read the startup records from the CSV file in the working directory.
dataset = pd.read_csv('50_Startups.csv')

# Features are every column except the last; the target is the last column.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [76]:
# Preview the first rows to check which columns were loaded.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


## Encoding categorical data

In [77]:
# One-hot encode the categorical 'State' column (one indicator column per
# state; no column is dropped).
# The features are taken from `dataset` rather than from `X` itself so this
# cell can be re-run safely: once 'State' has been replaced by its dummy
# columns, encoding `X` a second time would fail because the column is gone.
# dtype=int keeps the indicators as 0/1 integers; newer pandas versions
# default to booleans.
X = pd.get_dummies(dataset.iloc[:, :-1], columns=['State'], dtype=int)

In [78]:
# Confirm that 'State' was replaced by one indicator column per state.
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0


## Splitting the dataset into the Training set and Test set

In [79]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

In [80]:
# Peek at the held-out features; the index keeps the original row labels.
X_test.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
28,66051.52,182645.56,118148.2,0,1,0
11,100671.96,91790.61,249744.55,1,0,0
10,101913.08,110594.11,229160.95,0,1,0
41,27892.92,84710.77,164470.71,0,1,0
2,153441.51,101145.55,407934.54,0,1,0


In [81]:
from sklearn.linear_model import LinearRegression

# Linear regression estimator created with scikit-learn's default settings.
regressor = LinearRegression()

In [82]:
# Fit the model on the training rows. fit() updates the estimator in place
# and returns that same object, so no reassignment is needed; the trailing
# semicolon keeps the cell from echoing the estimator.
regressor.fit(X_train, y_train);

## Predicting Test Set Results

In [83]:
# Predict the target for the held-out test rows using the fitted model.
y_pred = regressor.predict(X_test)

In [84]:
# Show array values with two decimal places. This changes NumPy's global
# print options, so it also affects arrays displayed by later cells.
np.set_printoptions(precision=2)
y_pred

array([103015.2 , 132582.28, 132447.74,  71976.1 , 178537.48, 116161.24,
        67851.69,  98791.73, 113969.44, 167921.07])

### Comparing Test Set Results

In [85]:
# Predictions (left column) next to the actual test-set values (right column).
np.column_stack((y_pred, y_test.values))

array([[103015.2 , 103282.38],
       [132582.28, 144259.4 ],
       [132447.74, 146121.95],
       [ 71976.1 ,  77798.83],
       [178537.48, 191050.39],
       [116161.24, 105008.31],
       [ 67851.69,  81229.06],
       [ 98791.73,  97483.56],
       [113969.44, 110352.25],
       [167921.07, 166187.94]])

In [90]:
# Wrap predictions and actual values in single-column DataFrames that share
# a fresh 0..n-1 index, so they line up row by row when placed side by side.
# y_pred is a plain array, so its frame already gets that default index.
y_pred_df = pd.DataFrame({'Predicted Profit': y_pred})

# y_test still carries the shuffled row labels from the train/test split, so
# drop them. to_frame(name=...) sets the column name explicitly instead of
# relying on the Series already being named 'Profit', and nothing is mutated
# in place, so the cell can be re-run safely.
y_test_df = y_test.reset_index(drop=True).to_frame(name='Profit')

In [92]:
# Side-by-side table: actual profit next to the model's prediction.
df = pd.concat(objs=[y_test_df, y_pred_df], axis='columns')
df

Unnamed: 0,Profit,Predicted Profit
0,103282.38,103015.201598
1,144259.4,132582.277608
2,146121.95,132447.738452
3,77798.83,71976.098513
4,191050.39,178537.482211
5,105008.31,116161.242302
6,81229.06,67851.692097
7,97483.56,98791.733747
8,110352.25,113969.43533
9,166187.94,167921.065696


In [100]:
from sklearn.metrics import mean_squared_error

# Root mean squared error on the test set: the square root of the mean
# squared difference between actual and predicted values.
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
print(f'Root Mean Squared Error is: {rmse}')

Root Mean Squared Error is: 9137.990152794953


In [101]:
# Fitted coefficients, one per feature, in the column order of X_train.
regressor.coef_

array([ 7.73e-01,  3.29e-02,  3.66e-02,  8.66e+01, -8.73e+02,  7.86e+02])

In [103]:
# Intercept (constant term) of the fitted linear model.
regressor.intercept_

42467.52924855311