# Chapter 04 Numerical Descriptive Measures

## Example -- Iris
* http://archive.ics.uci.edu/ml/datasets/Iris
* This is perhaps the best known database to be found in the pattern recognition literature. 
* The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. 
* One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.


### Load the iris dataset

In [1]:
# Import modules
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the iris data
iris = pd.read_csv('examples\iris.data')
iris

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
# Load the iris data
iris = pd.read_csv('examples\iris.data',header=None)
iris.columns=['sepal_length','sepal_width','petal_length','petal_width','species']
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [4]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [6]:
iris.sample(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
54,6.5,2.8,4.6,1.5,Iris-versicolor
122,7.7,2.8,6.7,2.0,Iris-virginica
139,6.9,3.1,5.4,2.1,Iris-virginica
117,7.7,3.8,6.7,2.2,Iris-virginica
32,5.2,4.1,1.5,0.1,Iris-setosa
149,5.9,3.0,5.1,1.8,Iris-virginica
137,6.4,3.1,5.5,1.8,Iris-virginica
91,6.1,3.0,4.6,1.4,Iris-versicolor
130,7.4,2.8,6.1,1.9,Iris-virginica
20,5.4,3.4,1.7,0.2,Iris-setosa


In [7]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [8]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [11]:
iris.drop_duplicates(['species'])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
50,7.0,3.2,4.7,1.4,Iris-versicolor
100,6.3,3.3,6.0,2.5,Iris-virginica


### Central Tendency

#### mean
* Average value

In [12]:
np.mean(iris[:4])

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


sepal_length    4.825
sepal_width     3.200
petal_length    1.400
petal_width     0.200
dtype: float64

In [13]:
iris_outlier = iris.iloc[:,[0,1,2,3]]
iris_outlier.loc[150]=[100,100,100,100]
iris_outlier.tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8
150,100.0,100.0,100.0,100.0


In [14]:
np.mean(iris_outlier)

sepal_length    6.466887
sepal_width     3.696026
petal_length    4.396026
petal_width     1.852980
dtype: float64

### median
* In an ordered array, the median is the “middle” number (50% above, 50% below)

In [15]:
np.median(iris['sepal_length'])

5.8

In [16]:
np.median(iris['sepal_width'])

3.0

In [17]:
np.median(iris['petal_length'])

4.35

In [18]:
np.median(iris['petal_width'])

1.3

In [19]:
np.median(iris_outlier['sepal_length'])

5.8

In [20]:
np.median(iris_outlier['sepal_width'])

3.0

In [21]:
np.median(iris_outlier['petal_length'])

4.4

In [22]:
np.median(iris_outlier['petal_width'])

1.3

### mode
* Value that occurs most often

In [23]:
from scipy import stats
stats.mode(iris['sepal_length'])

ModeResult(mode=array([5.]), count=array([10]))

In [24]:
stats.mode(iris['sepal_width'])

ModeResult(mode=array([3.]), count=array([26]))

In [25]:
stats.mode(iris['petal_length'])

ModeResult(mode=array([1.5]), count=array([14]))

In [26]:
stats.mode(iris['petal_width'])

ModeResult(mode=array([0.2]), count=array([28]))

In [27]:
stats.mode(iris_outlier['sepal_length'])

ModeResult(mode=array([5.]), count=array([10]))

In [28]:
stats.mode(iris_outlier['sepal_width'])

ModeResult(mode=array([3.]), count=array([26]))

In [29]:
stats.mode(iris_outlier['sepal_width'])

ModeResult(mode=array([3.]), count=array([26]))

In [30]:
stats.mode(iris_outlier['petal_width'])

ModeResult(mode=array([0.2]), count=array([28]))

###  Histogram 

In [31]:
%matplotlib notebook
sns.distplot(iris["sepal_length"])



<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='sepal_length', ylabel='Density'>

In [32]:
# Add the line of mean, median and mode
plt.axvline(np.mean(iris['sepal_length']), ls = '-', color = 'r', label = "mean")
plt.axvline(np.median((iris['sepal_length'])), ls = '-', color = 'g', label = "median")
plt.axvline(stats.mode((iris['sepal_length']))[0], ls = '-', color = 'k', label = "mode")
plt.legend()

<matplotlib.legend.Legend at 0x270ec871cd0>

### Measures of Variation

#### range
* Difference between the largest and the smallest values

In [33]:
np.ptp(iris['petal_length'])

5.9

In [34]:
np.ptp(iris['petal_width'])

2.4

In [35]:
np.ptp(iris['sepal_length'])

3.6000000000000005

In [36]:
np.ptp(iris['sepal_width'])

2.4000000000000004

#### Variance
* Average (approximately) of squared deviations of values from the mean

In [37]:
np.var(iris['petal_length'])

3.0924248888888854

In [38]:
np.var(iris['petal_width'])

0.5785315555555559

In [39]:
np.var(iris['sepal_length'])

0.6811222222222222

In [40]:
np.var(iris['sepal_width'])

0.1867506666666667

#### Standard Deviation

In [41]:
np.std(iris['petal_length'])

1.7585291834055201

In [42]:
np.std(iris['petal_width'])

0.760612618588172

In [43]:
np.std(iris['sepal_length'])

0.8253012917851409

In [44]:
np.std(iris['sepal_width'])

0.4321465800705435

#### Coefficient of Variation
* Measures relative variation
* Can be used to compare the variability of two or more sets of data measured in different units 

In [45]:
## Suppose the length of petal is in millimeter units, however, the orginial length of petal is in centimeter
iris['petal_length']*10

0      14.0
1      14.0
2      13.0
3      15.0
4      14.0
       ... 
145    52.0
146    50.0
147    52.0
148    54.0
149    51.0
Name: petal_length, Length: 150, dtype: float64

In [47]:
np.std(iris['petal_length'])# in the unit of centimeter

1.7585291834055201

In [48]:
np.std(iris['petal_length']*10)# in the unit of millimeter

17.58529183405521

It makes no sense to compare these two value, becasue they are in different units.

In [49]:
CV_petal=np.std(iris['petal_length'])/np.mean(iris['petal_length'])
CV_petal

0.46785983950129095

In [50]:
CV_petal_m=np.std(iris['petal_length']*100)/np.mean(iris['petal_length']*100)
CV_petal_m

0.46785983950129145

In [51]:
CV_sepal=np.std(iris['sepal_length'])/np.mean(iris['sepal_length'])
CV_sepal

0.1412380989934639

#### z-score
* The Z-score is the number of standard deviations a data value is from the mean.

In [52]:
from sklearn import preprocessing
iris_z=pd.DataFrame()
iris_z['sepal_length']=preprocessing.scale(iris['sepal_length'])
iris_z['sepal_width']=preprocessing.scale(iris['sepal_width'])
iris_z['petal_length']=preprocessing.scale(iris['petal_length'])
iris_z['petal_width']=preprocessing.scale(iris['petal_width'])
iris_z.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,-2.775558e-16,-5.140333e-16,1.154632e-16,9.251859e-16
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.438987,-1.568735,-1.44445
25%,-0.9006812,-0.5877635,-1.227541,-1.181504
50%,-0.05250608,-0.1249576,0.3362659,0.1332259
75%,0.6745011,0.5692513,0.7627586,0.7905908
max,2.492019,3.114684,1.786341,1.710902


In [53]:
iris[iris_z['sepal_width']>3]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
15,5.7,4.4,1.5,0.4,Iris-setosa


In [54]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


#### Quartile Measures
* Quartiles split the ranked data into 4 segments with an equal number of values per segment

In [55]:
iris['sepal_length'].quantile(0.25)

5.1

In [56]:
iris['sepal_length'].quantile(0.5)

5.8

In [57]:
iris['sepal_length'].quantile(0.75)

6.4

#### Interquartile Range (IQR)
* The IQR is Q3 – Q1 and measures the spread in the middle 50% of the data

In [58]:
IQR_sepal_length = iris['sepal_length'].quantile(0.75)-iris['sepal_length'].quantile(0.25)
IQR_sepal_length

1.3000000000000007

#### Boxplot
* A Graphical display of the data based on the five-number summary

In [59]:
sns.countplot(x="species",data=iris)

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='species', ylabel='count'>

In [60]:
sns.barplot(x='species',y='petal_length',data=iris)

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='species', ylabel='petal_length'>

In [61]:
sns.boxplot(x='species',y='petal_length',data=iris)

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='species', ylabel='petal_length'>

#### Shape: skewness

In [62]:
iris_vir=iris[iris.species == 'Iris-virginica']
iris_s=iris[iris.species == 'Iris-setosa']
iris_ver=iris[iris.species =='Iris-versicolor']

In [63]:
sns.distplot(iris_vir['petal_width'],label='vir').set(ylim=(0,15))
sns.distplot(iris_s['petal_width'],label='s')
sns.distplot(iris_ver['petal_width'],label='ver')
plt.legend()



<IPython.core.display.Javascript object>



<matplotlib.legend.Legend at 0x270ede02c70>

In [64]:
stats.skew(iris_vir['petal_width'])
# left-skewed/negative skewed

-0.1255597931582545

In [65]:
stats.skew(iris_s['petal_width'])
# right-skewed/positive skewed

1.1610221111253474

In [66]:
stats.skew(iris_ver['petal_width'])
# approximately symmetric

-0.030236304298168936

#### Covariance
* The covariance measures the strength of the linear relationship between two numerical variables
* It is not possible to determine the relative strength of the relationship from the size of the covariance

In [67]:
np.cov(iris['petal_length'],iris['petal_width'])

array([[3.11317942, 1.29638747],
       [1.29638747, 0.58241432]])

#### Coefficient of Correlation
* Measures the relative strength of the linear relationship between two numerical variables

In [68]:
np.corrcoef(iris['petal_length'],iris['petal_width'])

array([[1.       , 0.9627571],
       [0.9627571, 1.       ]])

In [69]:
corr = iris.corr()
corr

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.109369,0.871754,0.817954
sepal_width,-0.109369,1.0,-0.420516,-0.356544
petal_length,0.871754,-0.420516,1.0,0.962757
petal_width,0.817954,-0.356544,0.962757,1.0


In [70]:
sns.heatmap(corr)

<IPython.core.display.Javascript object>

<AxesSubplot:>