In [36]:
%matplotlib inline

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [37]:
# Load the Carseats data set from a CSV file in the current working directory.
carseat = pd.read_csv('Carseats.csv')

In [38]:
# Preview the first rows of the raw data.
carseat.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [39]:
# Print column names, non-null counts and dtypes: useful for spotting missing
# values and the text (object) columns that need encoding.
carseat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
Sales          400 non-null float64
CompPrice      400 non-null int64
Income         400 non-null int64
Advertising    400 non-null int64
Population     400 non-null int64
Price          400 non-null int64
ShelveLoc      400 non-null object
Age            400 non-null int64
Education      400 non-null int64
Urban          400 non-null object
US             400 non-null object
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [40]:
# Every column except the response 'Sales' is used as a predictor.
x_variables = [column for column in carseat.columns if column != 'Sales']

In [41]:
# OLS of Sales on all predictors plus two interaction terms
# (Income:Advertising and Price:Age), specified with a formula string.
model = smf.ols(
    formula='Sales ~ Income:Advertising+Price:Age + ' + "+".join(x_variables),
    data=carseat,
)

In [42]:
# Estimate the model; the fitted results object is reused by the summary cell.
results = model.fit()

In [43]:
# Show the regression table: coefficients, standard errors, t-statistics,
# p-values, confidence intervals and overall fit statistics.
results.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.876
Model:,OLS,Adj. R-squared:,0.872
Method:,Least Squares,F-statistic:,210.0
Date:,"Thu, 29 Aug 2019",Prob (F-statistic):,6.140000000000001e-166
Time:,17:00:03,Log-Likelihood:,-564.67
No. Observations:,400,AIC:,1157.0
Df Residuals:,386,BIC:,1213.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.5756,1.009,6.519,0.000,4.592,8.559
ShelveLoc[T.Good],4.8487,0.153,31.724,0.000,4.548,5.149
ShelveLoc[T.Medium],1.9533,0.126,15.531,0.000,1.706,2.201
Urban[T.Yes],0.1402,0.112,1.247,0.213,-0.081,0.361
US[T.Yes],-0.1576,0.149,-1.058,0.291,-0.450,0.135
Income:Advertising,0.0008,0.000,2.698,0.007,0.000,0.001
Price:Age,0.0001,0.000,0.801,0.424,-0.000,0.000
CompPrice,0.0929,0.004,22.567,0.000,0.085,0.101
Income,0.0109,0.003,4.183,0.000,0.006,0.016

0,1,2,3
Omnibus:,1.281,Durbin-Watson:,2.047
Prob(Omnibus):,0.527,Jarque-Bera (JB):,1.147
Skew:,0.129,Prob(JB):,0.564
Kurtosis:,3.05,Cond. No.,131000.0


In [44]:
from sklearn.preprocessing import LabelBinarizer

# Encode the two Yes/No columns (Urban, US) as 0/1 indicator columns.
# fit_transform() refits the binarizer on every call, so no separate fit()
# is needed beforehand; after this cell `lb` is left fitted on carseat.US.
lb = LabelBinarizer()
urban_dummies = lb.fit_transform(carseat.Urban)
us_dummies = lb.fit_transform(carseat.US)

# fit_transform() returns a NumPy array, so wrap each result in a DataFrame.
# Reusing carseat's index keeps the rows aligned when the new columns are
# joined back onto carseat, even if its index is not the default 0..n-1.
urban_dummies_df = pd.DataFrame(urban_dummies, columns=["urban"], index=carseat.index)
us_dummies_df = pd.DataFrame(us_dummies, columns=["us"], index=carseat.index)


In [45]:
# Attach the 0/1 indicator columns to the main frame. assign() overwrites a
# column that already exists, so re-running this cell does not append
# duplicate 'urban'/'us' columns the way a repeated pd.concat would.
carseat = carseat.assign(
    urban=urban_dummies_df["urban"],
    us=us_dummies_df["us"],
)

In [46]:
# Check that the new 'urban' and 'us' indicator columns line up with the
# original Urban / US text columns.
carseat.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,urban,us
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes,1,1
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes,1,1
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes,1,1
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes,1,1
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No,1,0


In [47]:
from sklearn.preprocessing import LabelEncoder

# Map ShelveLoc to integer codes. fit_transform() fits the encoder itself, so
# no separate fit() call is needed.
# NOTE: the codes follow the order of le.classes_ (Bad=0, Good=1, Medium=2 in
# the outputs below), i.e. alphabetical rather than quality order, so the
# column is a category label and not an ordinal score.
le = LabelEncoder()
shelveloc_dummies = le.fit_transform(carseat.ShelveLoc)

# Reuse carseat's index so the codes stay row-aligned when joined back on.
shelveloc_dummies_df = pd.DataFrame(
    shelveloc_dummies, columns=["shelveloc"], index=carseat.index
)

In [48]:
# Attach the integer-coded shelf location. assign() overwrites an existing
# 'shelveloc' column, so re-running this cell does not append a duplicate.
carseat = carseat.assign(shelveloc=shelveloc_dummies_df["shelveloc"])

In [49]:
# Check the new 'shelveloc' integer codes against the ShelveLoc text column.
carseat.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,urban,us,shelveloc
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes,1,1,0
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes,1,1,1
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes,1,1,2
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes,1,1,2
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No,1,0,0


In [50]:
# Show the labels learned by the encoder; the label at position i is the one
# encoded as integer i in the 'shelveloc' column.
le.classes_

array(['Bad', 'Good', 'Medium'], dtype=object)