# Use-case: An investor company has assigned you a project. The goal of the project is to create a model that can predict the profit of a company based on the company's spending pattern and the company's location

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the startup dataset (path is relative to the notebook's working directory).
data = pd.read_csv('50_Startups.csv')

In [4]:
# Show column names, dtypes and non-null counts to check for missing values
# and to spot which columns are non-numeric before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [9]:
# Make the data model-ready: every column must be numeric.
# State is the only text column, so it is one-hot encoded (one 0/1 column per
# state) and placed in front of the numeric columns, with Profit kept last.
stateDummies = pd.get_dummies(data["State"], dtype=int)
numericColumns = data.iloc[:, [0, 1, 2, 4]]  # R&D Spend, Administration, Marketing Spend, Profit
finalData = pd.concat([stateDummies, numericColumns], axis=1)
finalData.head()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
0,0,0,1,165349.2,136897.8,471784.1,192261.83
1,1,0,0,162597.7,151377.59,443898.53,191792.06
2,0,1,0,153441.51,101145.55,407934.54,191050.39
3,0,0,1,144372.41,118671.85,383199.62,182901.99
4,0,1,0,142107.34,91391.77,366168.42,166187.94


In [10]:
# Split the prepared frame into NumPy arrays for modelling.
# Profit (the last column) is the label; it is selected with a list so the
# resulting array stays 2-D with a single column.
label = finalData[["Profit"]].values
# Every column before Profit is a candidate feature.
features = finalData[finalData.columns[:-1]].values

# Feature Engineering Track

# Method 1. Correlation Analysis

In [11]:
# Pairwise correlation between all columns of finalData. The Profit row/column
# shows how strongly each candidate feature moves together with the label.
finalData.corr()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
California,1.0,-0.492366,-0.515152,-0.143165,-0.015478,-0.168875,-0.145837
Florida,-0.492366,1.0,-0.492366,0.105711,0.010493,0.205685,0.116244
New York,-0.515152,-0.492366,1.0,0.039068,0.005145,-0.03367,0.031368
R&D Spend,-0.143165,0.105711,0.039068,1.0,0.241955,0.724248,0.9729
Administration,-0.015478,0.010493,0.005145,0.241955,1.0,-0.032154,0.200717
Marketing Spend,-0.168875,0.205685,-0.03367,0.724248,-0.032154,1.0,0.747766
Profit,-0.145837,0.116244,0.031368,0.9729,0.200717,0.747766,1.0


In [None]:
# Rule by Prashant Nair:
# Select those features whose correlation with the label (Profit) is greater than 50% (0.5)

# Features selected: R&D Spend, Marketing Spend
# Features eliminated: California, Florida, New York, Administration

# Method 2. Backward Elimination Technique using OLS

In [12]:
# Backward elimination, Step 1: perform "all in".
# OLS is given the design matrix exactly as supplied, so an explicit intercept
# column of ones is prepended to the full feature matrix.
# The row count is read from `features` instead of being hard-coded, so this
# cell keeps working if the dataset grows or shrinks.
interceptColumn = np.ones((features.shape[0], 1), dtype=int)
featuresAllIn = np.append(interceptColumn, features, axis=1)

In [None]:
# Step2: Decide the SL
# SL = 0.05

In [13]:
# Step 3, iteration 1: fit Ordinary Least Squares on the all-in design matrix.
#   endog -> label array (Profit)
#   exog  -> featuresAllIn (intercept column + every candidate feature)
# In the summary, const is the intercept and x1..x6 follow the column order of
# `features`: California, Florida, New York, R&D, Administration, Marketing.
#
# NOTE(review): exog holds the intercept column plus all three state dummy
# columns, and the dummies sum to the intercept column, so the design matrix is
# not full rank. Consider dropping one dummy column.
import statsmodels.regression.linear_model as stat

olsAllIn = stat.OLS(label, featuresAllIn)
model = olsAllIn.fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Tue, 25 Jul 2023",Prob (F-statistic):,1.34e-27
Time:,12:49:52,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.763e+04,5073.636,7.417,0.000,2.74e+04,4.79e+04
x1,1.249e+04,2449.797,5.099,0.000,7554.868,1.74e+04
x2,1.269e+04,2726.700,4.654,0.000,7195.596,1.82e+04
x3,1.245e+04,2486.364,5.007,0.000,7439.285,1.75e+04
x4,0.8060,0.046,17.369,0.000,0.712,0.900
x5,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x6,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1.72e+17


In [None]:
# Step4: Identify the feature with the highest p-value

# const --- Intercept Feature
# x1    --- California
# x2    --- Florida
# x3    --- NY
# x4    --- R&D
# x5    --- Adm
# x6    --- Markg

# Highest p-value: Adm (x5) = 0.608

In [14]:
# Step 5: Administration's p-value is above SL (0.05), so eliminate it.
# Administration sits at column 5 of featuresAllIn
# (0 = intercept, 1-3 = state dummies, 4 = R&D, 5 = Administration, 6 = Marketing).
newFeatureCol = np.delete(featuresAllIn, 5, axis=1)

In [15]:
# Step 3, iteration 2: refit OLS after Administration has been removed.
#   endog -> label array (Profit)
#   exog  -> newFeatureCol (intercept, three state dummies, R&D, Marketing)
# In the summary, x1..x3 are the state dummies, x4 is R&D and x5 is Marketing.
import statsmodels.regression.linear_model as stat

olsWithoutAdmin = stat.OLS(label, newFeatureCol)
model = olsWithoutAdmin.fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,215.8
Date:,"Tue, 25 Jul 2023",Prob (F-statistic):,9.720000000000001e-29
Time:,12:50:02,Log-Likelihood:,-525.53
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1071.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.525e+04,2100.376,16.782,0.000,3.1e+04,3.95e+04
x1,1.171e+04,1910.312,6.130,0.000,7861.854,1.56e+04
x2,1.185e+04,2170.903,5.459,0.000,7477.785,1.62e+04
x3,1.169e+04,1988.428,5.879,0.000,7684.996,1.57e+04
x4,0.7967,0.042,18.771,0.000,0.711,0.882
x5,0.0298,0.016,1.842,0.072,-0.003,0.062

0,1,2,3
Omnibus:,14.64,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.037
Skew:,-0.938,Prob(JB):,2.7e-05
Kurtosis:,5.565,Cond. No.,5.05e+17


In [None]:
# Step4: Identify the feature with the highest p-value

# const --- Intercept Feature
# x1    --- California
# x2    --- Florida
# x3    --- NY
# x4    --- R&D
# x5    --- Markg

# Highest p-value: Markg (x5) = 0.072

In [None]:
# Step 5: Marketing's p-value is above SL (0.05), so eliminate it as well.
# Columns 5 (Administration) and 6 (Marketing) of featuresAllIn are both gone
# now, leaving the intercept, the three state dummies and R&D.
newFeatureCol1 = np.delete(featuresAllIn, [5, 6], axis=1)

In [None]:
# Step 3, iteration 3: refit OLS after Marketing has been removed.
#   endog -> label array (Profit)
#   exog  -> newFeatureCol1 (intercept, three state dummies, R&D)
# In the summary, x1..x3 are the state dummies and x4 is R&D.
import statsmodels.regression.linear_model as stat

olsWithoutMarketing = stat.OLS(label, newFeatureCol1)
model = olsWithoutMarketing.fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,272.4
Date:,"Sat, 30 May 2020",Prob (F-statistic):,2.76e-29
Time:,16:10:00,Log-Likelihood:,-527.35
No. Observations:,50,AIC:,1063.0
Df Residuals:,46,BIC:,1070.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.686e+04,1959.786,18.806,0.000,3.29e+04,4.08e+04
x1,1.189e+04,1956.677,6.079,0.000,7955.697,1.58e+04
x2,1.306e+04,2122.665,6.152,0.000,8785.448,1.73e+04
x3,1.19e+04,2036.022,5.847,0.000,7805.580,1.6e+04
x4,0.8530,0.030,28.226,0.000,0.792,0.914

0,1,2,3
Omnibus:,13.418,Durbin-Watson:,1.122
Prob(Omnibus):,0.001,Jarque-Bera (JB):,17.605
Skew:,-0.907,Prob(JB):,0.00015
Kurtosis:,5.271,Cond. No.,3.7e+17


In [None]:
# Step 7: no remaining column has a p-value above SL in the iteration-3 summary,
# so the iteration-3 design matrix is kept as the final feature set.
# This binds a second name to the same array; no copy is made.
featuresFinal = newFeatureCol1


In [None]:
#The above will become input to your modelling algo!!!

# Method 3: RFE (Recursive Feature Elimination ) Technique

In [None]:
# RFE can be applied only to algorithms that expose per-feature weights after
# fitting (a coef_ or a feature_importances_ attribute)
#
# Regression:
# 1. LinearRegression (coef_)
# 2. SupportVectorRegression (coef_ is available only with a linear kernel)
# 3. DecisionTreeRegressor (feature_importances_)
# 4. RandomForestRegressor (feature_importances_)

# Classification:
# 1. DecisionTreeClassifier (feature_importances_)
# 2. RandomForestClassifier (feature_importances_)

In [None]:
# Steps to apply RFE
# 1. Initialize the model algorithm
# 2. Apply RFE to model (ALL FEATURES AND LABEL)
# 3. Get the features with the best ranking (Rank 1 = selected by RFE. Sometimes Rank 2 is also considered)

In [None]:
# Recursive Feature Elimination (RFE) driven by a linear regression model.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# 1. Initialize the model algorithm
modelLR = LinearRegression()

# 2. Apply RFE to model (ALL FEATURES AND LABEL)
#    step=1 drops one feature per elimination round (step suggested by Prashant Nair).
selectFeaturesFromRFE = RFE(estimator=modelLR, step=1)

# Fit the data with RFE.
# `label` is a 2-D column array; RFE expects a 1-D target and otherwise emits a
# DataConversionWarning ("y = column_or_1d(y, warn=True)"), so it is flattened here.
selectFeaturesFromRFE.fit(features, label.ravel())

# 3. Get the ranking of every feature (1 = selected; higher = eliminated earlier).
#    Column order: California, Florida, NY, R&D, Adm, Markg
# NOTE(review): the ranking is driven by the size of the fitted coefficients,
# which depends on each column's scale; the 0/1 state dummies and the raw spend
# amounts are on very different scales here. Consider scaling before RFE.
print(selectFeaturesFromRFE.ranking_)

[1 1 1 2 3 4]


  y = column_or_1d(y, warn=True)


In [None]:
#Observation: State (Rank 1) and R&D (Rank 2) are shortlisted for Model Building

# Method 4: Select By Model

In [None]:
# Steps to apply SBM (works with any model algo that exposes coef_ or feature_importances_)
# 1. Initialize the model algorithm
# 2. Apply SBM to model (ALL FEATURES AND LABEL)
# 3. Get Features with High Support (True/False) 

In [None]:
# Select-By-Model (SBM) driven by a linear regression model.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression

# 1. Initialize the model algorithm
modelLR = LinearRegression()

# 2. Wrap the model in SelectFromModel and fit it on ALL features and the label.
#    fit() returns the selector itself, so construction and fitting are chained.
selectFeaturesFromSBM = SelectFromModel(modelLR).fit(features, label)

# 3. Boolean support mask: True marks a selected feature.
#    Column order: California, Florida, NY, R&D, Adm, Markg
print(selectFeaturesFromSBM.get_support())

[ True  True  True False False False]


In [None]:
#Observation: Select California, Florida and NY as finalFeatures

# Apply Outputs of Feature Engineering

In [None]:

# Build one feature matrix per selection method so each can be modelled in turn.
# Column positions inside `features` (same order as finalData without Profit).
CALIFORNIA, FLORIDA, NEW_YORK, RND, ADMIN, MARKETING = range(6)
stateColumns = [CALIFORNIA, FLORIDA, NEW_YORK]

# Method 1 - correlation analysis: R&D and Marketing.
featureCorrAnalysis = features[:, [RND, MARKETING]]
# Method 2 - backward elimination: state dummies and R&D.
featureBackwardEliminationOLS = features[:, stateColumns + [RND]]
# Method 3 - RFE: rank 1 (states) plus rank 2 (R&D), and rank 1 alone.
featureRFERank1n2 = features[:, stateColumns + [RND]]
featureRFERank1Only = features[:, stateColumns]
# Method 4 - SelectFromModel: columns whose support flag is True (states).
featureSBMTrueOnly = features[:, stateColumns]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Minimum test R^2 a split must reach to be reported.
# CL was not defined in any cell of this notebook, so the loop failed with a
# NameError on a fresh kernel. 0.0 is consistent with the recorded output
# (splits with a test score of about 0.015 were printed).
# NOTE(review): raise this to the quality level you actually require.
CL = 0.0

# Try random_state 1..50 and report every train/test split where the model
# generalizes (test score above train score) and meets the CL threshold.
# Swap `featureCorrAnalysis` for one of the other feature sets to evaluate
# another selection method.
# NOTE(review): the output stored under this cell (test R^2 around 0.15) looks
# like it came from a states-only feature set - re-run to refresh it.
for i in range(1, 51):
    X_train, X_test, y_train, y_test = train_test_split(
        featureCorrAnalysis, label, test_size=0.2, random_state=i
    )
    model1 = LinearRegression()
    model1.fit(X_train, y_train)

    # score() returns R^2 for regressors.
    train_score = model1.score(X_train, y_train)
    test_score = model1.score(X_test, y_test)

    if test_score > train_score and test_score >= CL:
        print("Test: {} , Train: {} , RS : {}".format(test_score, train_score, i))

Test: 0.1507109998333278 , Train: -0.06593244561790113 , RS : 6
Test: 0.12089159244307222 , Train: -0.01969795421804088 , RS : 10
Test: 0.015080875800281057 , Train: 0.0017731552271403883 , RS : 32
Test: 0.025328630963713517 , Train: 0.013479770737339636 , RS : 49


In [None]:
#Output: 99% / 0.99 achieved using Corr Analysis using LinearRegression, RS=10
#Output: 98% / 0.98 achieved using Backward Elimination using OLS, RS=10
#Output: 98% achieved using RFE (Rank 1 and 2) , RS=10
#Output: 15% achieved using RFE(Rank1) using LinearRegression algo, RS=10
#Output: 15% achieved using SBM using LinearRegression algo, RS=10
