In [None]:
#Use-case: To create a model that can predict the profit of the company based on company's location and company's spending pattern.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the startups dataset from a CSV file given by a relative path.
data = pd.read_csv('50_Startups.csv')

In [None]:
# Inspect column names, dtypes and non-null counts before any preprocessing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [None]:
# List the distinct values of the categorical State column.
data['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [None]:
# Check for missing values: count the NaN entries in each column
data.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [None]:
# Check for outliers - only on the feature columns if performing inferential stats, otherwise on all columns.
# Never check outliers for the label column if performing supervised learning.
# describe() summarises the numeric columns (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [None]:
# Inferential stats prerequisites:
# 1. Ensure your data is complete
# 2. Ensure your data is strictly numeric
# Preview the first rows to see which columns still need encoding.
data.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# One-hot encode the categorical State column and place the dummy columns in
# front of the remaining numeric columns, so that Profit (the label) stays last.
# dtype=int keeps the dummies as 0/1 integers: pandas >= 2.0 defaults to bool,
# which would turn the mixed feature matrix into an object array downstream.
# Dropping State by name (instead of picking columns by position) keeps the
# selection correct even if the CSV column order changes.
finalData = pd.concat(
    [pd.get_dummies(data['State'], dtype=int), data.drop(columns='State')],
    axis=1,
)
finalData.head()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
0,0,0,1,165349.2,136897.8,471784.1,192261.83
1,1,0,0,162597.7,151377.59,443898.53,191792.06
2,0,1,0,153441.51,101145.55,407934.54,191050.39
3,0,0,1,144372.41,118671.85,383199.62,182901.99
4,0,1,0,142107.34,91391.77,366168.42,166187.94


In [None]:
# Inferential Stats 
# 1. Separate the data into features and label, and store both as NumPy arrays
# 2. Split our data as Training Set and Testing Set where Training Set is used for training/convergence purpose
#    whereas Testing set is used for Quality Check Purpose.

In [None]:
# 1. Separate the data into features and label, and store both as NumPy arrays

# Columns 0-5 are the predictors: the three State dummies followed by the three spend columns.
features = finalData.iloc[:, :6].to_numpy()
# Column 6 (Profit) is the label; indexing with a list keeps the result two-dimensional.
label = finalData.iloc[:, [6]].to_numpy()

In [None]:
# 2. Split our data as Training Set and Testing Set where Training Set is used for training/convergence purpose
#    whereas Testing set is used for Quality Check Purpose.

# Step1: Decide the split ratio, e.g. 65:35 or 80:20 (train:test); 80:20 is used below
# Step2: Implement the same using Sci-kit Learn package

In [None]:
# Names used for the four pieces of the split:
#   X_train --> training features      y_train --> training label
#   X_test  --> testing features       y_test  --> testing label
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=10
)

# Training uses (X_train, y_train); the quality check uses (X_test, y_test).

In [None]:
# Build the model
# Regression Algo ------> Linear Regression
#
#  y = mx + b   (Slope Intercept Formula)
#
#  profit = b0(California) + b1(Florida) + b2(New York)+ b3(R&D Spend) + b4(AdminSpend) + b5(MarketingSpend) + intercept
#
# Goal of this algo is to derive values of b0,b1,b2,b3,b4,b5,intercept based on the historical data !

In [None]:
from sklearn.linear_model import LinearRegression
# Create an unfitted linear regression estimator with default settings.
model = LinearRegression()


In [None]:
# Train the model, i.e. fit the algorithm to the training set.
# Signature: fit(features, label)
model.fit(X_train,y_train)

LinearRegression()

In [None]:
# Learned intercept term of the fitted model.
model.intercept_

array([50001.73604086])

In [None]:
# Learned coefficients, in feature-column order:
# California, Florida, New York, R&D Spend, Administration, Marketing Spend.
model.coef_

array([[ 8.41023126e+01,  6.95447747e+02, -7.79550060e+02,
         8.05859453e-01, -1.79706621e-02,  2.28153524e-02]])

In [None]:
# Profit = 8.41023126e+01(California) + 6.95447747e+02(Florida) - 7.79550060e+02(New York) + 8.05859453e-01 (RDSpend) - 1.79706621e-02(Admin) + 2.28153524e-02(MarkSpend) + 50001.73604086

In [None]:
# Quality Check (Guideline by Prashant Nair)
# ========================================================================================
# 1. Ensure your model that is converged is a Generalized Model
#
# Generalized model is a model that performs well with Known and Unknown data
# 
# Technique : Accuracy(Test Data)  >  Accuracy(Train Data) ------> Model is Generalized Model
#
# 2. Ensure the Model's accuracy score must be greater than or equal to CL score (*CL = 1 - SL)
#
# Technique : Accuracy(Test Data) >= CL

In [None]:
# 1. Ensure the fitted model is a generalized model

# For a regressor, scikit-learn's score() returns the coefficient of determination (R^2)
# on the data passed in; that value is what this notebook calls "accuracy".
trainAccuracy = model.score(X_train, y_train)
testAccuracy = model.score(X_test, y_test)

print("Test Score is {} and train Score is {}".format(testAccuracy, trainAccuracy))

Test Score is 0.9901105113397691 and train Score is 0.9385918220043519


In [None]:
# As observed above, testScore > trainScore , thus the model is generalized !!!

In [None]:
# SL = 0.05
# CL = 0.95
# 2. Ensure the Model's accuracy score must be greater than or equal to CL score (*CL = 1 - SL)
#
# Technique : Accuracy(Test Data) >= CL

#Since my testScore is greater than CL, model passed the Quality Check !

In [None]:
# Deployment Test
#
# Collect one company's figures interactively and predict its profit with the
# trained model. The feature row must follow the training column order:
# [California, Florida, New York, R&D Spend, Administration, Marketing Spend].

rdSpend = float(input("Enter R&D Spend: "))
admSpend = float(input("Enter Administration Spend: "))
markSpend = float(input("Enter Marketing Spend: "))
# Trim stray whitespace so that e.g. "Florida " is not rejected as an unknown state.
state = input("Enter State: ").strip()

# Order matters: it mirrors the dummy columns the model was trained on.
refState = ['California', 'Florida','New York']

# Match case-insensitively, then continue with the canonical spelling.
canonicalState = {name.lower(): name for name in refState}.get(state.lower())

if canonicalState is not None:
  state = canonicalState
  # One-hot row derived from refState, so the encoding cannot drift from the list.
  stateDummy = np.array([[int(name == state) for name in refState]])

  finalFeatures = np.concatenate((stateDummy, np.array([[rdSpend,admSpend,markSpend]])) , axis=1)

  # The label used for training was two-dimensional, so predict() returns a 2-D array.
  profit = model.predict(finalFeatures)[0][0]

  print("Predicted profit is $ {}".format(profit))

else:
  print("Model can't predict profit for the given {} state".format(state))

Enter R&D Spend: 234567
Enter Administration Spend: 45678
Enter Marketing Spend: 76543
Enter State: Florida
Predicted profit is $ 240650.70978691286
