In [6]:
import os

import numpy as np
import pandas as pd
# Notebook display settings: render frames as HTML tables and truncate them
# to a 10 x 10 preview so long outputs stay readable.
pd.set_option("display.notebook_repr_html", True)
pd.set_option("display.max_columns", 10)
pd.set_option("display.max_rows", 10)

from sklearn import preprocessing, neighbors, grid_search, cross_validation
import statsmodels.api as sm
import statsmodels.formula.api as smf

import matplotlib.pyplot as plt
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to subsequent figures.
plt.style.use('ggplot')

In [7]:
# Dataset: read the abalone CSV from the sibling `datasets` directory and
# preview the first rows.
abalone_csv = os.path.join('..', 'datasets', 'abalone.csv')
df = pd.read_csv(abalone_csv)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [8]:
# Count the missing values in each column.
null_counts = df.isnull().sum()
null_counts

Sex              0
Length           0
Diameter         0
Height           0
WholeWeight      0
ShuckedWeight    0
VisceraWeight    0
ShellWeight      0
Rings            0
dtype: int64

In [9]:
# Pairwise Pearson correlations between the numeric columns.
# `df` still contains the string column `Sex`; recent pandas versions no longer
# drop non-numeric columns silently in `corr()` and raise instead, so select
# the numeric columns explicitly.
df.select_dtypes(include=[np.number]).corr()

Unnamed: 0,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
Length,1.0,0.986812,0.827554,0.925261,0.897914,0.903018,0.897706,0.55672
Diameter,0.986812,1.0,0.833684,0.925452,0.893162,0.899724,0.90533,0.57466
Height,0.827554,0.833684,1.0,0.819221,0.774972,0.798319,0.817338,0.557467
WholeWeight,0.925261,0.925452,0.819221,1.0,0.969405,0.966375,0.955355,0.54039
ShuckedWeight,0.897914,0.893162,0.774972,0.969405,1.0,0.931961,0.882617,0.420884
VisceraWeight,0.903018,0.899724,0.798319,0.966375,0.931961,1.0,0.907656,0.503819
ShellWeight,0.897706,0.90533,0.817338,0.955355,0.882617,0.907656,1.0,0.627574
Rings,0.55672,0.57466,0.557467,0.54039,0.420884,0.503819,0.627574,1.0


In [14]:
# Baseline model: regress Rings on ShellWeight and Diameter, then show the
# full regression report.
baseline_fit = smf.ols(formula='Rings ~ ShellWeight + Diameter', data=df).fit()
baseline_fit.summary()

0,1,2,3
Dep. Variable:,Rings,R-squared:,0.394
Model:,OLS,Adj. R-squared:,0.394
Method:,Least Squares,F-statistic:,1357.0
Date:,"Tue, 11 Oct 2016",Prob (F-statistic):,0.0
Time:,20:31:07,Log-Likelihood:,-9770.0
No. Observations:,4177,AIC:,19550.0
Df Residuals:,4174,BIC:,19560.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,6.1651,0.246,25.036,0.000,5.682 6.648
ShellWeight,13.7802,0.657,20.972,0.000,12.492 15.068
Diameter,1.1705,0.922,1.270,0.204,-0.636 2.977

0,1,2,3
Omnibus:,1230.844,Durbin-Watson:,1.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3683.689
Skew:,1.522,Prob(JB):,0.0
Kurtosis:,6.45,Cond. No.,32.2


In [15]:
# One-hot encode Sex into Sex_F / Sex_I / Sex_M indicator columns.
# The dtype returned by get_dummies differs between pandas versions (float,
# uint8 or bool); cast to float so the columns are always numeric 0.0/1.0 and
# are treated as plain numeric regressors in the model formulas.
sex_df = pd.get_dummies(df.Sex, prefix='Sex').astype(float)

In [16]:
# Preview the first five rows of the indicator columns.
sex_df.head(n=5)

Unnamed: 0,Sex_F,Sex_I,Sex_M
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0


In [17]:
# Attach the Sex indicator columns to the main frame (aligned on the index).
# Any indicator columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not fail on overlapping column names.
df = df.drop(sex_df.columns, axis=1, errors='ignore').join(sex_df)

In [18]:
# Preview the frame with the Sex indicator columns appended.
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,...,ShellWeight,Rings,Sex_F,Sex_I,Sex_M
0,M,0.455,0.365,0.095,0.514,...,0.15,15,0.0,0.0,1.0
1,M,0.35,0.265,0.09,0.2255,...,0.07,7,0.0,0.0,1.0
2,F,0.53,0.42,0.135,0.677,...,0.21,9,1.0,0.0,0.0
3,M,0.44,0.365,0.125,0.516,...,0.155,10,0.0,0.0,1.0
4,I,0.33,0.255,0.08,0.205,...,0.055,7,0.0,1.0,0.0


In [47]:
# Extended model: size and weight measurements plus the Sex indicators.
# Sex_I is left out of the formula, which makes it the reference level for the
# Sex_F and Sex_M coefficients.
extended_fit = smf.ols(
    formula='Rings ~ Height + Diameter + ShellWeight + ShuckedWeight + Sex_F + Sex_M',
    data=df,
).fit()
extended_fit.summary()

0,1,2,3
Dep. Variable:,Rings,R-squared:,0.521
Model:,OLS,Adj. R-squared:,0.52
Method:,Least Squares,F-statistic:,755.4
Date:,"Tue, 11 Oct 2016",Prob (F-statistic):,0.0
Time:,20:59:12,Log-Likelihood:,-9279.8
No. Observations:,4177,AIC:,18570.0
Df Residuals:,4170,BIC:,18620.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,2.9014,0.247,11.741,0.000,2.417 3.386
Height,10.9850,1.559,7.048,0.000,7.929 14.041
Diameter,10.7414,0.997,10.772,0.000,8.787 12.696
ShellWeight,19.6491,0.655,29.994,0.000,18.365 20.933
ShuckedWeight,-11.6365,0.377,-30.876,0.000,-12.375 -10.898
Sex_F,0.8655,0.103,8.384,0.000,0.663 1.068
Sex_M,0.9201,0.097,9.493,0.000,0.730 1.110

0,1,2,3
Omnibus:,1056.536,Durbin-Watson:,1.419
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3338.694
Skew:,1.278,Prob(JB):,0.0
Kurtosis:,6.557,Cond. No.,59.9
