In [6]:
import seaborn as sns
import pandas as pd
import statsmodels.formula.api as smf
import numpy as np

In [3]:
# Load seaborn's 'mpg' example dataset into a DataFrame named `mpg`.
# NOTE(review): load_dataset presumably needs network access (or a local cache)
# the first time it runs -- confirm for offline environments.
mpg = sns.load_dataset('mpg')

In [4]:
# Preview the first five rows (five is the default row count for head()).
mpg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [9]:
# Mean of the mpg column. This cell runs before the dropna cell, so the value
# is computed on all rows as loaded, not on the sample used for the model.
mpg['mpg'].mean()

23.514572864321615

In [10]:
# Standard deviation of the mpg column (pandas defaults to the sample
# standard deviation, ddof=1), again on all rows as loaded.
mpg['mpg'].std()

7.815984312565782

In [11]:
# Median of the mpg column, on all rows as loaded.
mpg['mpg'].median()

23.0

In [12]:
# Summary statistics for every numeric column. This includes discrete integer
# columns such as cylinders and model_year, not only continuous variables.
# A count below the row total (see horsepower) indicates missing values.
mpg.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [13]:
# Drop every row that has a missing value in any column.
# In the summary above only horsepower has fewer non-null values than the
# other columns (392 vs 398), so this removes the rows lacking horsepower.
# Rebinding the name instead of using inplace=True keeps the cell free of
# in-place mutation and leaves any other reference to the loaded frame intact.
mpg = mpg.dropna()

In [14]:
# Re-check the numeric summary after dropping missing rows: every column
# should now report the same count.
mpg.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [15]:
# List the distinct values of the origin column; these are the levels of the
# categorical term C(origin) used in the regression formula below.
mpg['origin'].unique()

array(['usa', 'japan', 'europe'], dtype=object)

In [16]:
# Ordinary least squares: regress mpg on horsepower and weight, with origin
# entered as a categorical term through C(...).
formula = 'mpg ~ horsepower + weight + C(origin)'
model = smf.ols(formula, data=mpg)
result = model.fit()

In [17]:
result.summary()
# Interpretation of the result

# For the continuous predictors (each effect holds the other predictors fixed):
# if horsepower increases by 1, mpg is expected to decrease by 0.0535 (coef = -0.0535)
# if weight increases by 1 pound, mpg is expected to decrease by 0.0048 (coef = -0.0048)
# Both coefficients are significant at the 5% level (p < 0.05).

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.719
Model:,OLS,Adj. R-squared:,0.716
Method:,Least Squares,F-statistic:,247.9
Date:,"Wed, 21 Jul 2021",Prob (F-statistic):,2.44e-105
Time:,18:49:25,Log-Likelihood:,-1112.2
No. Observations:,392,AIC:,2234.0
Df Residuals:,387,BIC:,2254.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,43.7011,0.932,46.871,0.000,41.868,45.534
C(origin)[T.japan],1.7811,0.696,2.558,0.011,0.412,3.150
C(origin)[T.usa],-0.9611,0.640,-1.501,0.134,-2.220,0.298
horsepower,-0.0535,0.011,-4.875,0.000,-0.075,-0.032
weight,-0.0048,0.001,-8.878,0.000,-0.006,-0.004

0,1,2,3
Omnibus:,35.026,Durbin-Watson:,0.914
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47.308
Skew:,0.658,Prob(JB):,5.33e-11
Kurtosis:,4.078,Cond. No.,15400.0


In [None]:
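# For the categorical variable (origin)
# Europe has no row in the coefficient table, so it is the baseline level;
# the Japan and USA coefficients are differences relative to European cars.

# if a car is a European car (baseline)
# mpg = 43.7 - 0.0535 horsepower - 0.0048 weight

# if a car is a Japanese car
# mpg = 43.7 - 0.0535 horsepower - 0.0048 weight + 1.7811

# if a car is a USA car
# mpg = 43.7 - 0.0535 horsepower - 0.0048 weight - 0.9611
# (the USA coefficient is not significant at the 5% level, p = 0.134, so the
#  USA-vs-Europe difference cannot be distinguished from zero; the fitted
#  equation nevertheless still includes the estimated -0.9611 term)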