# Continuous Methods

In [None]:
# imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from pydataset import data

### Visualize:

### Anscombe's Quartet:

In [None]:
# Source CSV for the quartet data; the cells below read its
# 'dataset', 'x' and 'y' columns.
url = "https://gist.githubusercontent.com/ryanorsinger/6218f5731f3df7cb4771ff3879fdeaa3/raw/88cb6bed276e2236c33df011bd753b6c73832c30/quartet.csv"

# Requires network access: pandas fetches the CSV straight from the URL.
df = pd.read_csv(url)

In [None]:
# Peek at the first rows to see what the quartet data set looks like
# (last expression of the cell, so the notebook renders it).
df.head()

In [None]:
# Summary statistics computed separately for each value of 'dataset',
# so the groups can be compared numerically.
df.groupby('dataset').describe()

In [None]:
# One scatter panel per 'dataset' value, so the x/y relationship of each
# group can be compared visually.
sns.relplot(data=df, x='x', y='y', col='dataset')

In [None]:
# load up the swiss data set

In [None]:
# Rebind df to the 'swiss' dataset from pydataset. From this cell on, df no
# longer refers to the quartet frame loaded earlier.
df = data('swiss')

In [None]:
# First rows of the swiss frame, to check the columns before analysing them.
df.head()

In [None]:
# get the stats via describe

In [None]:
# show_doc=True asks pydataset for the dataset's bundled documentation
# instead of the data itself.
data('swiss', show_doc=True)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the swiss frame.
df.describe()

In [None]:
# histograms

In [None]:
# For every numeric column draw a histogram, then a boxplot, so both the
# shape of the distribution and its outliers are visible.
#
# select_dtypes(include='number') is used instead of np.issubdtype because
# np.issubdtype raises TypeError on pandas extension dtypes (category,
# string, ...); select_dtypes simply leaves those columns out.
for col in df.select_dtypes(include='number').columns:
    # Explicit figure/axes instead of implicit pyplot state, so each plot
    # carries its own title and labels.
    fig, ax = plt.subplots()
    df[col].hist(ax=ax)
    ax.set(title=col, xlabel=col, ylabel='count')
    plt.show()
    plt.close(fig)  # release the figure; the loop creates two per column

    fig, ax = plt.subplots()
    sns.boxplot(data=df, x=col, ax=ax)
    ax.set(title=col)
    plt.show()
    plt.close(fig)

    # Visual separator between columns in the cell output.
    print('--------')

In [None]:
# Grid of pairwise plots over the columns of df, to eyeball relationships
# between variables.
sns.pairplot(df)

### Using z-score

### Exponential Tails of the Normal Distribution
| Range | Expected Fraction of Population Inside Range | Approximate Expected Frequency Outside Range | Approximate frequency for daily event |
| ----- | -------------------------------------------- | -------------------------------------------- | -------------------------------------------- |
| μ ± 0.5σ | 0.382924922548026                            | 2 in 3 |Four or five times a week|
| μ ± σ | 0.682689492137086 | 1 in 3 |Twice a week|
| μ ± 1.5σ | 0.866385597462284 | 1 in 7 |Weekly|
| μ ± 2σ | 0.954499736103642 | 1 in 22 |Every three weeks|
| μ ± 2.5σ | 0.987580669348448 | 1 in 81 |Quarterly|
| μ ± 3σ | 0.997300203936740 | 1 in 370 |Yearly|
| μ ± 3.5σ | 0.999534741841929 | 1 in 2149 |Every 6 years|
| μ ± 4σ | 0.999936657516334 | 1 in 15,787 |Every 43 years|
| μ ± 4.5σ | 0.999993204653751 | 1 in 147,160 |Every 403 years|
| μ ± 5σ | 0.999999426696856 | 1 in 1,744,278 |Every 4776 years|
| μ ± 5.5σ | 0.999999962020875 | 1 in 26,330,254 |Every 72,090 years|
| μ ± 6σ | 0.999999998026825 | 1 in 506,797,346 |Every 1.38 million years|

### Using IQR

In [None]:
# steps to defining IQR/Tukey method:
# get the Q1 and Q3 values
# determine our multiplier
# use these quantities to flag abnormalities

In [None]:
# Let's examine examination

In [None]:
# First quartile (25th percentile) of the Examination column.
df['Examination'].quantile(q=0.25)

In [None]:
# Quartiles of Examination and the interquartile range between them.
q1 = df['Examination'].quantile(q=0.25)
q3 = df['Examination'].quantile(q=0.75)
iqr = q3 - q1

# Begin with the inner fence, whose multiplier is 1.5.
multiplier = 1.5

In [None]:
# Display Q1, Q3 and the IQR together as a tuple to sanity-check them.
q1, q3, iqr

In [None]:
# inner or outer: 1.5 fence multiplier convention for inner, 3.0 mult convention for outer
# lower: q1 - mult* iqr
# upper: q3 + iqr*mult

In [None]:
# Distance each fence sits beyond its quartile.
fence_reach = multiplier * iqr

# Inner fences: Q1 minus the reach, Q3 plus the reach.
inner_lower_fence = q1 - fence_reach
inner_upper_fence = q3 + fence_reach

In [None]:
# Rows whose Examination value falls strictly below the lower inner fence
# or strictly above the upper inner fence.
df[df['Examination'].lt(inner_lower_fence) | df['Examination'].gt(inner_upper_fence)]

In [None]:
# z-score:
# subtract the mean from the data point, divide by the standard deviation

In [None]:
# Let's switch to Infant Mortality

In [None]:
# remember our z score calculation:
#  (x - x_mean) / x_std

In [None]:
# z-score of every row: distance from the column mean in units of the
# column's standard deviation (pandas' default, sample std with ddof=1).
infant_mortality = df['Infant.Mortality']
z_scores = (infant_mortality - infant_mortality.mean()) / infant_mortality.std()

In [None]:
# Inspect the computed z-scores (one value per row of df).
z_scores

In [None]:
# Store the z-scores on df as a new column so rows can be filtered by them.
# NOTE: this mutates df in place. Cells that walk every column of df (the
# histogram/boxplot loop, the pairplot) will include this column if they are
# re-run after this cell.
df['infant_mortality_zscores'] = z_scores

In [None]:
# do the same thing for z scores but with examination again

In [None]:
# Rows at least 3 standard deviations away from the Infant.Mortality mean,
# in either direction.
df[df['infant_mortality_zscores'].abs().ge(3)]

In [None]:
# Same z-score calculation, now for Examination. This rebinds z_scores,
# replacing the Infant.Mortality values computed earlier.
examination = df['Examination']
z_scores = (examination - examination.mean()) / examination.std()

In [None]:
# Store the Examination z-scores on df as a new column for filtering.
# NOTE: mutates df in place, like the infant_mortality_zscores column.
df['examination_zscores'] = z_scores

In [None]:
# Rows at least 2 standard deviations away from the Examination mean, in
# either direction (a looser cut-off than the 3 used for Infant.Mortality).
df[df['examination_zscores'].abs().ge(2)]

In [None]:
# Outer-fence setup for Infant.Mortality. This rebinds q1, q3, iqr and
# multiplier, replacing the values computed for Examination.
q1 = df['Infant.Mortality'].quantile(q=0.25)
q3 = df['Infant.Mortality'].quantile(q=0.75)
iqr = q3 - q1

# Outer fences use a multiplier of 3.
multiplier = 3

In [None]:
# Distance each fence sits beyond its quartile.
fence_reach = multiplier * iqr

# Outer fences: Q1 minus the reach, Q3 plus the reach.
outer_lower_fence = q1 - fence_reach
outer_upper_fence = q3 + fence_reach

In [None]:
# Rows whose Infant.Mortality value falls strictly below the lower outer
# fence or strictly above the upper outer fence.
df[df['Infant.Mortality'].lt(outer_lower_fence) | df['Infant.Mortality'].gt(outer_upper_fence)]