In [1]:
# Sample data set used by the examples in this notebook: 204 integer counts,
# listed in roughly descending order (largest value 99, smallest value 2).
# NOTE(review): presumably each entry is the defect count of one observed unit — confirm.
num_defects =  [99, 52, 42, 44, 27, 22, 22, 24, 22, 21,
                21, 18, 18, 15, 19, 19, 18, 17, 18, 16,
                16, 17, 18, 15, 15, 13, 11, 14, 14, 13,
                14, 14, 14, 13, 14, 14, 14, 14, 12, 14,
                13, 12, 11, 12, 13, 13, 12, 13, 11, 13,
                11, 12, 12, 10, 10, 11, 12, 13, 10, 12,
                11, 10, 12, 12, 11, 11, 9, 9, 9, 11,
                12, 8, 8, 10, 8, 10, 11, 10, 9, 10,
                10, 8, 8, 11, 10, 9, 8, 8, 10, 10,
                7, 7, 7, 8, 8, 7, 7, 8, 7, 7,
                9, 7, 7, 10, 7, 8, 9, 9, 6, 6,
                8, 8, 9, 7, 7, 9, 6, 8, 9, 8,
                9, 9, 6, 8, 9, 7, 8, 6, 8, 6,
                6, 6, 6, 6, 6, 8, 7, 8, 7, 5,
                6, 8, 8, 8, 8, 5, 4, 6, 6, 4,
                6, 6, 4, 5, 7, 5, 7, 6, 5, 4,
                6, 4, 7, 5, 6, 4, 4, 6, 6, 3,
                5, 6, 6, 4, 6, 3, 3, 3, 5, 5,
                5, 3, 2, 5, 2, 3, 2, 4, 3, 2,
                5, 2, 2, 2, 3, 4, 4, 4, 5, 5,
                3, 5, 4, 2]

In [3]:
# Enable inline plots and
# import matplotlib.pyplot under the alias plt

%matplotlib inline
import matplotlib.pyplot as plt

In [4]:
# importing the Counter class helps us simplify the counting process

from collections import Counter

In [5]:
def data_range(x):
    """
    Returns the range (i.e. the difference) between the highest and lowest values

    Accepts any iterable, including one-shot iterators such as generators:
    the input is materialized once so that the maximum and the minimum are
    both computed over the same elements.

    :param x: iterable of mutually comparable values that support subtraction
    :return: the largest value minus the smallest value
    :raises ValueError: if x contains no elements
    """

    # max() would exhaust a generator and leave nothing for min(),
    # so take a snapshot of the elements first.
    observations = list(x)

    return max(observations) - min(observations)

In [6]:
# Sanity check on a tiny list: the largest value is 5 and the smallest is 1.
data_range([5, 3, 4, 2, 1])

4

In [7]:
# Range of the defect data: largest value (99) minus smallest value (2).
data_range(num_defects)

97

In [8]:
def quantile(values, percentile):
    """
    Returns the value found at a given fractional cut point of a sequence of values

    The values are sorted and the element at index
    int(len(values) * percentile) is returned; no interpolation between
    neighbouring values is performed.

    :param values: iterable of mutually comparable values
    :param percentile: the cut point as a fraction between 0 and 1 inclusive
        (e.g. 0.25 for the 25th percentile), NOT a number between 0 and 100
    :return: the element of values located at the requested cut point
    :raises ValueError: if percentile is outside the range [0, 1]
    :raises IndexError: if values is empty
    """

    # A negative fraction would silently index from the END of the sorted
    # list, and anything above 1 is not a meaningful cut point: reject both.
    if not 0 <= percentile <= 1:
        raise ValueError('percentile must be between 0 and 1, got {!r}'.format(percentile))

    ordered = sorted(values)

    # percentile == 1 maps to index len(values), one past the end; clamp it
    # so that the 100% cut point is the largest value.
    p_index = min(int(len(ordered) * percentile), len(ordered) - 1)
    return ordered[p_index]

In [9]:
# Given a sequence of values, we can calculate a given quantile.
# Here we ask for the value at the halfway mark (50%, i.e. 0.5),
#     which is roughly the median.
# With ten values the true median would be the average of the two middle
#     values (80 and 85); quantile() simply returns the upper one.

grades1 = [85, 87, 89, 55, 65, 67, 75, 80, 91, 96]        # ten values

quantile(grades1, .5)

85

In [10]:
grades2 = [85, 87, 89, 55, 65, 67, 75, 80, 91, 96, 99]    # eleven values

quantile(grades2, .5)

# NOTE: quantile() is intentionally simple. It is less detailed than even
#     our median function: it does NOT distinguish between datasets with
#     an odd or an even number of values, and it never averages two middle
#     values -- it just returns the sorted element at the computed index.

85

In [11]:
# While not as sophisticated as our median calculation,
#     quantile() can also give us other cut points, such as
#     the 25th percentile
#     or the 75th percentile,
#     i.e. the quartiles.

print('25%: ', quantile(grades2, .25))
print('75%: ', quantile(grades2, .75))

# sorted for comparison: [55, 65, 67, 75, 80, 85, 87, 89, 91, 96, 99]

25%:  67
75%:  91


In [12]:
# Print the value found at several cut points of the defect data,
# from the 10% mark up to (almost) the very top of the distribution.
for percent in [0.10, 0.25, 0.75, 0.90, 0.999]:
    q = quantile(num_defects, percent)

    print('Percent: {}\t{}'.format(percent, q))

Percent: 0.1	4
Percent: 0.25	6
Percent: 0.75	12
Percent: 0.9	16
Percent: 0.999	99


In [17]:
def interquartile_range(values, upper_bound=0.75, lower_bound=0.25):
    """
    Spread between two quantiles of a sequence of values -- by default the
    75% and 25% cut points, i.e. the interquartile range.

    :param values: the data handed to quantile() for both cut points
    :param upper_bound: fraction selecting the upper cut point (default 0.75)
    :param lower_bound: fraction selecting the lower cut point (default 0.25)
    :return: quantile(values, upper_bound) - quantile(values, lower_bound)
    """

    upper_value = quantile(values, upper_bound)
    lower_value = quantile(values, lower_bound)
    return upper_value - lower_value

In [14]:
values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Let's first just confirm the quantile VALUES at the 25% and 75% cut points
# (the interquartile range is the difference between these two values)
print(quantile(values, 0.25), '<->', quantile(values, 0.75))

3 <-> 8


In [15]:
# Now, let's calculate the interquartile range
# (default bounds: the 75% cut point minus the 25% cut point)

print(interquartile_range(values))

5


In [16]:
# We can calculate the interquartile range for our defects data
# (using the default 75% and 25% bounds)

interquartile_range(num_defects)

6

In [18]:
def mean(values):
    """
    Arithmetic average: the total of the values divided by how many there are.

    >>> mean([1, 2, 3, 4])
    2.5

    >>> mean([1, 2, 3, 4, 5])
    3.0

    :param values: sized collection of numbers
    :return: sum(values) / len(values)
    """
    total = sum(values)
    count = len(values)
    return total / count

In [19]:
def diff_mean(values):
    """
    Deviation of each value from the mean of the whole sequence.

    :param values: sequence of numbers (read twice: once by mean(), once here)
    :return: new list holding value - mean(values) for every value,
        in the original order
    """

    center = mean(values)

    deviations = []
    for value in values:
        deviations.append(value - center)
    return deviations

In [20]:
def variance(values, ddof=0):
    """
    Return the variance of a sequence of values.

    The sum of squared deviations from the mean is divided by
    len(values) - ddof:

    * ddof=0 (the default) gives the population variance (divide by n).
    * ddof=1 gives the sample variance (divide by n - 1).

    NOTE:  with ddof=1 this function requires that values has a minimum of
    two elements; with the default ddof=0 a single element is enough.

    :param values: sequence of numbers
    :param ddof: "delta degrees of freedom" subtracted from n in the divisor
    :return: sum of squared deviations divided by (len(values) - ddof)
    :raises ValueError: if len(values) - ddof is not positive for a
        non-empty sequence
    """

    n = len(values)
    deviations = diff_mean(values)
    sum_of_squares = sum(d ** 2 for d in deviations)

    # Guard the divisor only after the deviations are computed, so an empty
    # sequence still fails in the mean calculation exactly as it did before.
    divisor = n - ddof
    if divisor <= 0:
        raise ValueError(
            'variance requires more than {} value(s), got {}'.format(ddof, n))

    return sum_of_squares / divisor