In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn import linear_model
import statsmodels.api as sm
from bokeh.plotting import *
from bokeh.io import output_notebook
from bokeh.charts import Scatter, show

In [2]:
# Tell Bokeh to render its plots inline in the notebook output cells.
output_notebook()

In [3]:
# Load the startup data set; the path is relative to the working directory.
df = pd.read_csv('50-Startups.csv')

In [4]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,California,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,California,166187.94


In [5]:
# One-hot encode the categorical 'State' column: one 0/1 indicator column per
# distinct state, named after the state. The explicit cast to int matters on
# newer pandas, where get_dummies returns bool columns; mixing bool and float
# columns in a design matrix produces an object array that statsmodels rejects.
dummies = pd.get_dummies(df['State']).rename(columns=str).astype(int)

# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-executing the cell does not create duplicate columns (which
# would make a later df[[..., 'New York']] selection return the column twice).
df = pd.concat([df.drop(dummies.columns, axis=1, errors='ignore'), dummies], axis=1)

In [6]:
# Preview the frame again to confirm the per-state indicator columns were appended.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,California,New York
0,165349.2,136897.8,471784.1,New York,192261.83,0,1
1,162597.7,151377.59,443898.53,California,191792.06,1,0
2,153441.51,101145.55,407934.54,California,191050.39,1,0
3,144372.41,118671.85,383199.62,New York,182901.99,0,1
4,142107.34,91391.77,366168.42,California,166187.94,1,0


In [7]:
# Dependent variable (regression target) shared by every OLS model below.
profit = df['Profit']

In [8]:
# Model 1: regress profit on all three spend columns plus the 'New York'
# state indicator. add_constant prepends the intercept column ('const').
features1 = ['R&D Spend', 'Administration', 'Marketing Spend', 'New York']
X1 = sm.add_constant(df[features1])
model1 = sm.OLS(endog=profit, exog=X1)
results1 = model1.fit()
print(results1.summary())

                            OLS Regression Results                            
Dep. Variable:                 Profit   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.947
Method:                 Least Squares   F-statistic:                     218.4
Date:                Sun, 05 Mar 2017   Prob (F-statistic):           7.53e-29
Time:                        15:47:32   Log-Likelihood:                -525.25
No. Observations:                  50   AIC:                             1060.
Df Residuals:                      45   BIC:                             1070.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------
const            5.042e+04   6653.545     

In [9]:
# Model 2: same as model 1 but with 'Administration' removed from the
# regressors. add_constant prepends the intercept column ('const').
features2 = ['R&D Spend', 'Marketing Spend', 'New York']
X2 = sm.add_constant(df[features2])
model2 = sm.OLS(endog=profit, exog=X2)
results2 = model2.fit()
print(results2.summary())

                            OLS Regression Results                            
Dep. Variable:                 Profit   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     296.2
Date:                Sun, 05 Mar 2017   Prob (F-statistic):           4.44e-30
Time:                        15:47:36   Log-Likelihood:                -525.36
No. Observations:                  50   AIC:                             1059.
Df Residuals:                      46   BIC:                             1066.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------
const            4.772e+04   3018.340     

In [10]:
# Model 3: the two remaining spend columns only (the 'New York' indicator is
# dropped as well). add_constant prepends the intercept column ('const').
features3 = ['R&D Spend', 'Marketing Spend']
X3 = sm.add_constant(df[features3])
model3 = sm.OLS(endog=profit, exog=X3)
results3 = model3.fit()
print(results3.summary())

                            OLS Regression Results                            
Dep. Variable:                 Profit   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     450.8
Date:                Sun, 05 Mar 2017   Prob (F-statistic):           2.16e-31
Time:                        15:47:37   Log-Likelihood:                -525.54
No. Observations:                  50   AIC:                             1057.
Df Residuals:                      47   BIC:                             1063.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------
const            4.698e+04   2689.933     

In [11]:
# Compute the fitted (in-sample) profit values from model 3, i.e. from the
# regressors in X3: the constant, 'R&D Spend' and 'Marketing Spend'.
y_hat3 = results3.predict(X3)

In [12]:
# Compare observed profit with model 3's fitted values, both plotted against
# marketing spend (one of the two regressors in that model). The figure gets
# a title and axis labels so it can be read on its own.
p = figure(
    title='Profit vs. Marketing Spend: observed and fitted (model 3)',
    x_axis_label='Marketing Spend',
    y_axis_label='Profit',
)
# Blue markers (default glyph): observed profit.
p.scatter(df['Marketing Spend'], df['Profit'], fill_color='blue', size=10)
# Red triangles: profit predicted by results3 for the same rows.
p.scatter(df['Marketing Spend'], y_hat3, fill_color='red', marker='triangle', size=10)
# Print tick labels as plain numbers instead of scientific notation.
p.left[0].formatter.use_scientific = False
p.below[0].formatter.use_scientific = False
show(p)

In [13]:
# Model 4: simple regression of profit on 'R&D Spend' alone.
# add_constant prepends the intercept column ('const').
features4 = ['R&D Spend']
X4 = sm.add_constant(df[features4])
model4 = sm.OLS(endog=profit, exog=X4)
results4 = model4.fit()
print(results4.summary())

                            OLS Regression Results                            
Dep. Variable:                 Profit   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     849.8
Date:                Sun, 05 Mar 2017   Prob (F-statistic):           3.50e-32
Time:                        15:47:55   Log-Likelihood:                -527.44
No. Observations:                  50   AIC:                             1059.
Df Residuals:                      48   BIC:                             1063.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const       4.903e+04   2537.897     19.320      0.0