In [40]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [69]:
# Load the startups dataset from CSV and preview the first three rows.
# The location is an absolute path on the author's machine; adjust it when
# running the notebook elsewhere.
startups_csv_path = 'C:/Users/willi/OneDrive/Escritorio/Proyectos/Mahine_Learning_Project/Datasets/Ob_2_Startups_Data.csv'
df = pd.read_csv(startups_csv_path)
df.head(3)

Unnamed: 0,StartUp,R&D Spend,Administration,Marketing Spend,State,Profit
0,NovoTech,165349.2,136897.8,471784.1,New York,192261.83
1,Voxify,162597.7,151377.59,443898.53,California,191792.06
2,ZephyrTech,153441.51,101145.55,407934.54,Florida,191050.39


In [71]:
# Zeros are treated as missing values: turn them into NaN, then discard every
# row that lacks any of the spend figures or the profit.
required_columns = ["R&D Spend", "Administration", "Marketing Spend", "Profit"]
df = df.replace(0, np.nan).dropna(subset=required_columns)
df.head(3)

Unnamed: 0,StartUp,R&D Spend,Administration,Marketing Spend,State,Profit
0,NovoTech,165349.2,136897.8,471784.1,New York,192261.83
1,Voxify,162597.7,151377.59,443898.53,California,191792.06
2,ZephyrTech,153441.51,101145.55,407934.54,Florida,191050.39


In [72]:
# Feature matrix: the three spend columns followed by State, so State sits at
# position 3 where the encoding cell expects it.
# Columns are selected by name rather than by position so that re-running this
# cell after prediction columns have been added to df cannot pull the target
# (or predictions derived from it) into the features.
X = df[["R&D Spend", "Administration", "Marketing Spend", "State"]].values

# Target vector: the profit the model has to predict.
Y = df["Profit"].values

In [73]:
# Encode the categorical State column (position 3) as 0/1 dummy variables.
# OneHotEncoder accepts the string labels directly and orders its categories
# the same way a LabelEncoder would (sorted), so no intermediate label-encoding
# pass over the feature column is needed.
# remainder="passthrough" keeps the three spend columns; the column transformer
# places the encoded State columns first, followed by the passthrough columns.
onehotencoder = make_column_transformer((OneHotEncoder(), [3]), remainder = "passthrough")
X = onehotencoder.fit_transform(X)


In [74]:
# Remember to drop one dummy variable to avoid the dummy variable trap:
# with 3 dummy variables only 2 are needed, because the third is implied by the other two.
# NOTE: this cell is not idempotent; running it twice removes a second column.
X = X[:, 1:]

In [75]:
# Split the dataset into a training set and a testing set.
# test_size = 0.2 holds out 20% of the rows; random_state = 0 fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [76]:
# Fit the multiple linear regression model on the training set only;
# the testing set stays unseen until the prediction step.
regression = LinearRegression()
regression.fit(X_train, y_train)

In [77]:
# Score the held-out rows with the fitted model, rounded to two decimals
# (cents) for display.
y_pred = np.round(regression.predict(X_test), decimals=2)
y_pred

array([111740.74, 110856.78, 103738.34, 169845.61, 135394.84, 102490.98,
       101657.66, 136803.  , 117643.37,  99796.45])

In [78]:
# Put predicted and actual test-set profits side by side for comparison.
df_pred_test = pd.DataFrame(dict(Prediction_Profit=y_pred, Actual_Profit=y_test))
df_pred_test

Unnamed: 0,Prediction_Profit,Actual_Profit
0,111740.74,105733.54
1,110856.78,108733.99
2,103738.34,101004.64
3,169845.61,166187.94
4,135394.84,146121.95
5,102490.98,97427.84
6,101657.66,99937.59
7,136803.0,144259.4
8,117643.37,111313.02
9,99796.45,96778.92


Finding the subset of variables that gives the best model, using backward elimination.

In [79]:
# Backward elimination can be performed with the statsmodels library,
# but a new regression model has to be built with it.
import statsmodels.api as sm

# statsmodels' OLS does not add an intercept on its own: prepend a column of
# ones to X so that column 0 acts as the intercept term.
# The row count is taken from X itself, so the cell keeps working when the
# cleaned dataset has a different number of rows.
# NOTE: this cell is not idempotent; re-running it prepends another column of ones.
X = np.append(arr = np.ones((X.shape[0], 1)).astype(int), values = X, axis = 1)

# Significance level (SL): a variable whose P value is below SL is kept,
# a variable whose P value is above SL is a candidate for elimination.
SL = 0.05

In [80]:
# OLS (Ordinary Least Squares)
# This technique finds the coefficient values of the linear model
# that minimize the sum of the squared differences between the observed values
# and the values predicted by the model.
# The goal is to find the hyperplane that best fits the data.


# X_opt holds the columns of X currently kept in the model; start with all of them.
# Column layout of X: 0 = intercept, 1-2 = State dummies, 3 = R&D Spend,
# 4 = Administration, 5 = Marketing Spend.
X_opt = X[:, [0, 1, 2, 3, 4, 5]]

# Build the model. The matrix is passed as a nested list
# (presumably to sidestep X's mixed/object dtype; TODO confirm).
regression_OLS = sm.OLS(endog = Y, exog = X_opt.tolist()).fit()

# Show information about the model
regression_OLS.summary()

# Look for the variable with the highest P value.
# In the summary displayed for this cell that is x2 (column 2 of X_opt, a State dummy)
# with a P value of 0.567, which is higher than SL = 0.05.


# const and x3 have a P value of 0.000, which means they are highly significant.

0,1,2,3
Dep. Variable:,y,R-squared:,0.961
Model:,OLS,Adj. R-squared:,0.956
Method:,Least Squares,F-statistic:,198.7
Date:,"Thu, 13 Jul 2023",Prob (F-statistic):,3.9600000000000006e-27
Time:,13:09:47,Log-Likelihood:,-472.52
No. Observations:,46,AIC:,957.0
Df Residuals:,40,BIC:,968.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.977e+04,6126.191,9.757,0.000,4.74e+04,7.22e+04
x1,-1752.4344,2744.968,-0.638,0.527,-7300.222,3795.354
x2,-1597.2294,2764.853,-0.578,0.567,-7185.206,3990.748
x3,0.7751,0.039,19.828,0.000,0.696,0.854
x4,-0.0576,0.045,-1.275,0.210,-0.149,0.034
x5,0.0210,0.015,1.376,0.176,-0.010,0.052

0,1,2,3
Omnibus:,0.445,Durbin-Watson:,1.876
Prob(Omnibus):,0.801,Jarque-Bera (JB):,0.597
Skew:,0.173,Prob(JB):,0.742
Kurtosis:,2.562,Cond. No.,1610000.0


In [81]:
# Repeat the process without column 2 of X (the State dummy reported as x2 in the previous summary)
X_opt = X[:, [0, 1, 3, 4, 5]]
regression_OLS = sm.OLS(endog = Y, exog = X_opt.tolist()).fit()
regression_OLS.summary()

# In the summary displayed for this cell, x1 (column 1 of X, the remaining State dummy)
# has the highest P value, 0.691, which is above SL = 0.05.

0,1,2,3
Dep. Variable:,y,R-squared:,0.961
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,252.4
Date:,"Thu, 13 Jul 2023",Prob (F-statistic):,2.74e-28
Time:,13:09:49,Log-Likelihood:,-472.71
No. Observations:,46,AIC:,955.4
Df Residuals:,41,BIC:,964.6
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.937e+04,6035.758,9.836,0.000,4.72e+04,7.16e+04
x1,-930.9428,2328.756,-0.400,0.691,-5633.961,3772.076
x2,0.7760,0.039,20.030,0.000,0.698,0.854
x3,-0.0597,0.045,-1.338,0.188,-0.150,0.030
x4,0.0201,0.015,1.334,0.190,-0.010,0.051

0,1,2,3
Omnibus:,0.24,Durbin-Watson:,1.834
Prob(Omnibus):,0.887,Jarque-Bera (JB):,0.429
Skew:,0.103,Prob(JB):,0.807
Kurtosis:,2.575,Cond. No.,1590000.0


In [82]:
# Repeat the process without column 1 of X (the last State dummy)
X_opt = X[:, [0, 3, 4, 5]]
regression_OLS = sm.OLS(endog = Y, exog = X_opt.tolist()).fit()
regression_OLS.summary()


# In the summary displayed for this cell, x2 (column 4 of X, Administration) has P = 0.179
# and x3 (column 5 of X, Marketing Spend) has P = 0.199; both are above SL = 0.05.
# NOTE(review): the highest P value belongs to x3 (column 5), yet the next cell drops column 4;
# confirm which variable should be eliminated at this step.

0,1,2,3
Dep. Variable:,y,R-squared:,0.961
Model:,OLS,Adj. R-squared:,0.958
Method:,Least Squares,F-statistic:,343.4
Date:,"Thu, 13 Jul 2023",Prob (F-statistic):,1.47e-29
Time:,13:09:50,Log-Likelihood:,-472.8
No. Observations:,46,AIC:,953.6
Df Residuals:,42,BIC:,960.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.921e+04,5963.323,9.930,0.000,4.72e+04,7.12e+04
x1,0.7771,0.038,20.319,0.000,0.700,0.854
x2,-0.0603,0.044,-1.365,0.179,-0.149,0.029
x3,0.0193,0.015,1.304,0.199,-0.011,0.049

0,1,2,3
Omnibus:,0.344,Durbin-Watson:,1.821
Prob(Omnibus):,0.842,Jarque-Bera (JB):,0.518
Skew:,0.071,Prob(JB):,0.772
Kurtosis:,2.5,Cond. No.,1590000.0


In [83]:
# Repeat the process without column 4 of X (Administration)
X_opt = X[:, [0, 3, 5]]
regression_OLS = sm.OLS(endog = Y, exog = X_opt.tolist()).fit()
regression_OLS.summary()


# In the summary displayed for this cell, x2 (column 5 of X, Marketing Spend)
# has the highest P value, 0.074, which is still above SL = 0.05.

0,1,2,3
Dep. Variable:,y,R-squared:,0.959
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,504.0
Date:,"Thu, 13 Jul 2023",Prob (F-statistic):,1.4299999999999999e-30
Time:,13:09:51,Log-Likelihood:,-473.8
No. Observations:,46,AIC:,953.6
Df Residuals:,43,BIC:,959.1
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.186e+04,2581.289,20.090,0.000,4.67e+04,5.71e+04
x1,0.7581,0.036,21.081,0.000,0.686,0.831
x2,0.0258,0.014,1.828,0.074,-0.003,0.054

0,1,2,3
Omnibus:,0.069,Durbin-Watson:,1.624
Prob(Omnibus):,0.966,Jarque-Bera (JB):,0.268
Skew:,0.03,Prob(JB):,0.875
Kurtosis:,2.631,Cond. No.,629000.0


In [84]:
# Repeat the process without column 5 of X (Marketing Spend).
# Only the intercept (column 0) and column 3 (R&D Spend) remain in the model.
X_opt = X[:, [0, 3]]
regression_OLS = sm.OLS(endog = Y, exog = X_opt.tolist()).fit()
regression_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.956
Model:,OLS,Adj. R-squared:,0.955
Method:,Least Squares,F-statistic:,953.8
Date:,"Thu, 13 Jul 2023",Prob (F-statistic):,1.84e-31
Time:,13:09:52,Log-Likelihood:,-475.52
No. Observations:,46,AIC:,955.0
Df Residuals:,44,BIC:,958.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.411e+04,2328.726,23.235,0.000,4.94e+04,5.88e+04
x1,0.8046,0.026,30.884,0.000,0.752,0.857

0,1,2,3
Omnibus:,0.106,Durbin-Watson:,1.492
Prob(Omnibus):,0.948,Jarque-Bera (JB):,0.311
Skew:,0.038,Prob(JB):,0.856
Kurtosis:,2.604,Cond. No.,185000.0


In [170]:
# We can conclude that the variable in column 3 of X is the one that best predicts a company's profit.
# Given how X was assembled (intercept, two State dummies, then R&D Spend, Administration,
# Marketing Spend), column 3 is R&D Spend, not Marketing Spend.

"Automated implementation of Backward Elimination in Python"

In [None]:
# `sm` is the `statsmodels.api` alias imported earlier in the notebook.
# `statsmodels.formula.api` is deliberately not imported here: current releases
# expose no `OLS` class there, and re-binding `sm` would shadow the alias that
# the rest of the notebook relies on.
def backwardElimination(x, sl, y=None):
    """Run backward elimination and return the reduced design matrix.

    The model is refitted with OLS after every removal. At each step the
    column with the largest P value is dropped, until every remaining
    P value is less than or equal to ``sl``.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_columns)
        Design matrix. Every column is a candidate for removal, including an
        intercept column if one is present.
    sl : float
        Significance level; columns whose P value exceeds it are eliminated.
    y : array-like, optional
        Target vector. When omitted, the notebook-level ``Y`` is used so the
        original two-argument call keeps working.

    Returns
    -------
    numpy.ndarray
        The columns of ``x`` that survived elimination (possibly none).
        The input array itself is not modified.
    """
    if y is None:
        # Backward compatible with backwardElimination(x, sl).
        y = Y
    x = np.asarray(x)
    regressor_OLS = None
    while x.ndim == 2 and x.shape[1] > 0:
        # Cast for the fit only: the notebook's matrix may carry an object dtype.
        regressor_OLS = sm.OLS(y, x.astype(float)).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= sl:
            # Every remaining variable is significant: stop refitting.
            break
        # Remove exactly one column per fit so indices never go stale.
        x = np.delete(x, worst, axis=1)
    if regressor_OLS is not None and x.shape[1] > 0:
        # Show the final model; a bare .summary() inside a function displays nothing.
        print(regressor_OLS.summary())
    return x
 
SL = 0.05
# Pass the full design matrix directly instead of re-binding X_opt:
# the prediction cell further down expects X_opt to still hold the columns of
# the final manually selected model.
X_Modeled = backwardElimination(X[:, [0, 1, 2, 3, 4, 5]], SL)

In [86]:
# Obtain Predict_Profit from the final backward-elimination model.
# predict() without arguments scores the rows the model was fitted on, so the
# result no longer depends on whatever X_opt currently holds.
Y_Pred = regression_OLS.predict()
df["Predict_Profit"] = Y_Pred

# Profit_Growth is the gap between predicted and actual profit,
# expressed as a percentage of the actual profit.
df["Profit_Growth"] = ((df["Predict_Profit"] - df["Profit"]) / df["Profit"]) * 100
df.head(3)

Unnamed: 0,StartUp,R&D Spend,Administration,Marketing Spend,State,Profit,Predict_Profit,Profit_Growth
0,NovoTech,165349.2,136897.8,471784.1,New York,192261.83,187152.485662,-2.657493
1,Voxify,162597.7,151377.59,443898.53,California,191792.06,184938.569363,-3.573396
2,ZephyrTech,153441.51,101145.55,407934.54,Florida,191050.39,177571.301226,-7.055253


In [87]:
# Normalise both derived columns to floats with two decimals, then pick the
# three companies with the largest Profit_Growth.
derived_columns = ["Predict_Profit", "Profit_Growth"]
df[derived_columns] = df[derived_columns].astype(float).round(2)

df_top_3 = df.nlargest(3, "Profit_Growth")
df_top_3

Unnamed: 0,StartUp,R&D Spend,Administration,Marketing Spend,State,Profit,Predict_Profit,Profit_Growth
14,ViraTech,119943.24,156547.42,256512.92,Florida,132602.65,150617.87,13.59
15,DynaSoft,114523.61,122616.84,261776.23,New York,129917.04,146257.12,12.58
46,RoboGenius,1315.46,115816.21,297114.46,Florida,49490.75,55167.4,11.47


In [88]:
# Write the enriched data frame (with the prediction columns) to a new CSV,
# leaving the row index out of the file.
output_csv_path = "C:/Users/willi/OneDrive/Escritorio/Proyectos/Mahine_Learning_Project/Datasets/Ob_2_Startups_Data_2.csv"
df.to_csv(output_csv_path, index=False)
