## Build a machine learning model to predict the profit of a company based on its different expenses for a given dataset


### Data Preprocessing

In [1]:
# importing needed libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import sys
sys.path.append('/home/admin1/PycharmProjects/Machine Learning from scratch/')
from ipynb.fs.full.ml_library import *

# Load the startup dataset from CSV into a pandas DataFrame and preview its first rows
DATASET_PATH = '50_Startups.csv'
startup_data = pd.read_csv(DATASET_PATH)
startup_data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


#### Checking for null values

In [2]:
# Count the missing values in each column (isnull is an alias of isna)
startup_data.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [3]:
# describe() summarises only the numeric columns, so 'State' is absent from the output below
startup_data.describe()            # observing statistical summary of columns

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


#### Separating out features & labels

In [4]:
# The numerical expense columns form the feature matrix; 'Profit' is the label vector
numeric_feature_cols = ['R&D Spend', 'Administration', 'Marketing Spend']
x_values = startup_data[numeric_feature_cols].values
y_values = startup_data.loc[:, 'Profit'].values

#### One hot encoding using sklearn

In [5]:
# Encode the categorical 'State' column as one-hot (dummy) columns
categorical_cols = ['State']
numerical_cols = ['R&D Spend', 'Administration', 'Marketing Spend']
one_hot_encode = OneHotEncoder()

# .toarray() converts the encoder's result into a dense array with one dummy column per category
new_columns = one_hot_encode.fit_transform(startup_data.loc[:, categorical_cols]).toarray()

# A column with 3 categories needs only 2 dummy columns (the third category is implied
# when both are 0), so the last dummy column is left out.
# The feature matrix is rebuilt from startup_data instead of being appended to the
# existing x_values, so re-running this cell does not stack the dummy columns again.
x_values = np.hstack([startup_data.loc[:, numerical_cols].values, new_columns[:, :-1]])

#### Column Selection using backward elimination

In [6]:
# We are going to test the significance of each column (null hypothesis: the column has no effect on profit) to decide which columns to keep in the model

In [7]:
# Prepend a column of 1's so the model can estimate an intercept (constant) term
intercept_column = np.ones((x_values.shape[0], 1), dtype=x_values.dtype)
x_values = np.hstack([intercept_column, x_values])
x_values[:5]

array([[1.0000000e+00, 1.6534920e+05, 1.3689780e+05, 4.7178410e+05,
        0.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 1.6259770e+05, 1.5137759e+05, 4.4389853e+05,
        1.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 1.5344151e+05, 1.0114555e+05, 4.0793454e+05,
        0.0000000e+00, 1.0000000e+00],
       [1.0000000e+00, 1.4437241e+05, 1.1867185e+05, 3.8319962e+05,
        0.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 1.4210734e+05, 9.1391770e+04, 3.6616842e+05,
        0.0000000e+00, 1.0000000e+00]])

In [8]:
# Using OLS (Ordinary Least Squares) from the statsmodels library
import statsmodels.regression.linear_model as sm

# Backward elimination with a significance level of 0.05:
# a column whose p-value is above 0.05 is considered insignificant. At each stage the
# column with the largest p-value (if above 0.05) is discarded and the adjusted
# R-squared score is observed.

# Start with every column (including the added intercept column)
kept_columns = [0, 1, 2, 3, 4, 5]
x_opt = x_values[:, kept_columns]
ols_model = sm.OLS(y_values, x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sat, 25 Jan 2020",Prob (F-statistic):,1.34e-27
Time:,13:19:07,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.008e+04,6952.587,7.204,0.000,3.61e+04,6.41e+04
x1,0.8060,0.046,17.369,0.000,0.712,0.900
x2,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x3,0.0270,0.017,1.574,0.123,-0.008,0.062
x4,41.8870,3256.039,0.013,0.990,-6520.229,6604.003
x5,240.6758,3338.857,0.072,0.943,-6488.349,6969.701

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1470000.0


In [9]:
# Column 4 (x4 in the previous summary) had the largest p-value (0.990), so it is discarded
kept_columns = [0, 1, 2, 3, 5]
x_opt = x_values[:, kept_columns]
ols_model = sm.OLS(y_values, x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Sat, 25 Jan 2020",Prob (F-statistic):,8.49e-29
Time:,13:19:07,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,0.8060,0.046,17.606,0.000,0.714,0.898
x2,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x3,0.0270,0.017,1.592,0.118,-0.007,0.061
x4,220.1585,2900.536,0.076,0.940,-5621.821,6062.138

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [10]:
# Column 5 (listed as x4 in the previous summary) had the largest p-value (0.940), so it is discarded
kept_columns = [0, 1, 2, 3]
x_opt = x_values[:, kept_columns]
ols_model = sm.OLS(y_values, x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Sat, 25 Jan 2020",Prob (F-statistic):,4.53e-30
Time:,13:19:07,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [11]:
# Column 2 (x2 in the previous summary) had the largest p-value (0.602), so it is discarded
kept_columns = [0, 1, 3]
x_opt = x_values[:, kept_columns]
ols_model = sm.OLS(y_values, x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Sat, 25 Jan 2020",Prob (F-statistic):,2.1600000000000003e-31
Time:,13:19:07,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [12]:
# Column 3 (listed as x2 in the previous summary) had p-value 0.060, still above 0.05, so it is discarded
kept_columns = [0, 1]
x_opt = x_values[:, kept_columns]
ols_model = sm.OLS(y_values, x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sat, 25 Jan 2020",Prob (F-statistic):,3.5000000000000004e-32
Time:,13:19:07,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [13]:
# Observed that discarding the last column with a p-value greater than 0.05 (in the final step) decreases the
# adjusted R-squared score, so we keep the columns from the penultimate step

In [14]:
# Keep the two features retained by backward elimination: columns 1 and 3 of the
# design matrix, i.e. 'R&D Spend' and 'Marketing Spend'.
# They are selected by name from startup_data because slicing x_values in place
# (x_values[:, [1, 3]]) would raise an IndexError on a second run of this cell,
# when x_values has only two columns left.
selected_feature_cols = ['R&D Spend', 'Marketing Spend']
x_values = startup_data.loc[:, selected_feature_cols].values
x_values[:10]

array([[165349.2 , 471784.1 ],
       [162597.7 , 443898.53],
       [153441.51, 407934.54],
       [144372.41, 383199.62],
       [142107.34, 366168.42],
       [131876.9 , 362861.36],
       [134615.46, 127716.82],
       [130298.13, 323876.68],
       [120542.52, 311613.29],
       [123334.88, 304981.62]])

#### Splitting dataset into train set & test set

In [15]:
# Use 70% of the rows for training and the rest for testing; random_state fixes the shuffle
(train_x_values, test_x_values,
 train_y_values, test_y_values) = train_test_split(
    x_values,
    y_values,
    train_size=0.7,
    random_state=1,
)

### Building multiple linear regression model

In [16]:
# Create the linear regression model and fit it on the training set (fit returns the estimator itself)
regressor = LinearRegression().fit(train_x_values, train_y_values)
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
# printing the fitted parameter/coefficient for each feature column (same order as the columns of train_x_values)
regressor.coef_

array([0.76452582, 0.03678394])

#### Storing predictions for train set & test set

In [18]:
# Predict the labels for the train set and the test set, in that order
train_prediction, test_prediction = map(
    regressor.predict, (train_x_values, test_x_values)
)

#### Calculating errors

In [19]:
# Mean absolute error of the test-set predictions against the true test labels
mean_abs_error = metrics.mean_absolute_error(
    y_true=test_y_values,
    y_pred=test_prediction,
)
mean_abs_error

7241.485747807864

In [20]:
# Mean squared error of the test-set predictions against the true test labels
mean_sqr_error = metrics.mean_squared_error(
    y_true=test_y_values,
    y_pred=test_prediction,
)
mean_sqr_error

74764929.0575132

In [21]:
# RMSE is the square root of the MSE computed in the previous cell, which brings the
# error back to the same units as the label
root_mean_sqr_error = np.sqrt(mean_sqr_error)
root_mean_sqr_error

8646.67155947959

### Evaluating model against test set

In [22]:
# Score of the fitted regressor on the held-out test set
test_r2 = regressor.score(test_x_values, test_y_values)
print(f'r2_score: {test_r2}')

r2_score: 0.9528624476769572


In [23]:
# accuracy_regression is not defined in this notebook; presumably it is provided by the
# star import from ml_library. NOTE(review): confirm that it expects (prediction, actual)
# in this order and how it derives an accuracy figure from the absolute error.
accuracy_test = accuracy_regression(test_prediction, test_y_values)       # accuracy using absolute error
accuracy_test

91.14263835043789