In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
path = "50_Startups.csv"
df  = pd.read_csv(path)

In [3]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


Part # 2: Applying hot encoding on state(Categorical Variable)

Check unique values(classes) of state

In [4]:
df['State'].value_counts()

New York      17
California    17
Florida       16
Name: State, dtype: int64

In [5]:
dummies_of_states = pd.get_dummies(df.State, prefix='State')
print(dummies_of_states.head())

   State_California  State_Florida  State_New York
0                 0              0               1
1                 1              0               0
2                 0              1               0
3                 0              0               1
4                 0              1               0


In [7]:
df_with_one_hot = pd.concat([df, dummies_of_states], axis='columns')
df_with_one_hot.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,New York,192261.83,0,0,1
1,162597.7,151377.59,443898.53,California,191792.06,1,0,0
2,153441.51,101145.55,407934.54,Florida,191050.39,0,1,0
3,144372.41,118671.85,383199.62,New York,182901.99,0,0,1
4,142107.34,91391.77,366168.42,Florida,166187.94,0,1,0


In [9]:
X = df_with_one_hot.drop(["State", "Profit"], axis = "columns")
y = df_with_one_hot["Profit"]
print("shape of X = ", X.shape)
print("shape of y = ", y.shape)
X

shape of X =  (50, 6)
shape of y =  (50,)


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0
5,131876.9,99814.71,362861.36,0,0,1
6,134615.46,147198.87,127716.82,1,0,0
7,130298.13,145530.06,323876.68,0,1,0
8,120542.52,148718.95,311613.29,0,0,1
9,123334.88,108679.17,304981.62,1,0,0


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state=51)
print("shape of X_train = ", X_train.shape)
print("shape of y_train = ", y_train.shape)
print("shape of X_test = ", X_test.shape)
print("shape of y_test = ", y_test.shape)

shape of X_train =  (40, 6)
shape of y_train =  (40,)
shape of X_test =  (10, 6)
shape of y_test =  (10,)


In [11]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [12]:
lr.fit(X_train,y_train)

LinearRegression()

In [13]:
y_test

4     166187.94
48     35673.41
47     42559.73
15    129917.04
0     192261.83
17    125370.37
29    101004.64
31     97483.56
19    122776.86
2     191050.39
Name: Profit, dtype: float64

In [14]:
y_pred  = lr.predict(X_test)
y_pred = np.round_(y_pred, decimals = 2)
y_pred

array([172564.19,  55000.78,  47168.79, 148740.09, 193427.1 , 131760.33,
       104168.94, 100177.42, 118849.46, 182050.58])

In [17]:
pd.DataFrame(np.c_[X_test, y_test, y_pred], columns = ["R&D Spend", "Administration", "Marketing Spend", "State_California", "State_Florida",	"State_New York", "Actual Profit", "Profit Predicted"])

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York,Actual Profit,Profit Predicted
0,142107.34,91391.77,366168.42,0.0,1.0,0.0,166187.94,172564.19
1,542.05,51743.15,0.0,0.0,0.0,1.0,35673.41,55000.78
2,0.0,135426.92,0.0,1.0,0.0,0.0,42559.73,47168.79
3,114523.61,122616.84,261776.23,0.0,0.0,1.0,129917.04,148740.09
4,165349.2,136897.8,471784.1,0.0,0.0,1.0,192261.83,193427.1
5,94657.16,145077.58,282574.31,0.0,0.0,1.0,125370.37,131760.33
6,65605.48,153032.06,107138.38,0.0,0.0,1.0,101004.64,104168.94
7,61136.38,152701.92,88218.23,0.0,0.0,1.0,97483.56,100177.42
8,86419.7,153514.11,0.0,0.0,0.0,1.0,122776.86,118849.46
9,153441.51,101145.55,407934.54,0.0,1.0,0.0,191050.39,182050.58


In [18]:
lr.score(X_test,y_test)

0.964331969195485