# Descriptive Statistics

In [None]:
import numpy as np
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt
%matplotlib inline

## Arithmetic mean
## $$ \mathbf{\mu_{x}} = E[X] = \frac{x_{1} + x_{2} + \dots + x_{N}}{N}$$

In [None]:
# Draw one million samples from a normal distribution with mean 0.25 and
# standard deviation 0.10, then show how they fall across 50 histogram bins.
X = np.random.normal(0.25, 0.10, size=10**6)
# Assigned to `_` because the return value of plt.hist is not needed.
_ = plt.hist(X, bins=50)

In [None]:
print(X.mean())

In [None]:
print(X.std())

## Why is this important?

In [None]:
from scipy.stats import probplot

In [None]:
# Probability plot of X against a normal distribution (probplot's default).
# For the normal distribution, sparams is (loc, scale); the scale has to be
# the standard deviation 0.10 that X was generated with, not 10.
probplot(X, sparams=(0.25, 0.10), plot=plt)

In [None]:
# One million Poisson samples (rate 1.0), compared on a probability plot with
# a normal distribution that uses the sample's own mean and standard deviation.
Y = np.random.poisson(lam=1.0, size=10**6)
probplot(Y, sparams=(Y.mean(), Y.std()), plot=plt)
# Start a second figure so the histogram is not drawn over the probability plot.
plt.figure()
_ = plt.hist(Y, bins=50)

## Mean as a function of the means of parts
1. ### Divide $X$ into $k$ equal-sized partitions, $\{p_{1}, p_{2}, p_{3} ... p_{k}\}$, where size of each partition is $n$
2. ### Then $E[X]$ equals the mean of the means of the partitions, i.e.

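$$ E[X] = \frac{E[p_{1}] + E[p_{2}] + \dots + E[p_{k}]}{k} $$

$$ \therefore E[X] = \frac{\frac{\sum p_{1}}{n} + \frac{\sum p_{2}}{n} + \dots + \frac{\sum p_{k}}{n}}{k} $$

$$ \therefore E[X] = \frac{\sum p_{1} + \sum p_{2} + \dots + \sum p_{k}}{nk} $$

where $\sum p_{i}$ denotes the sum of the elements of partition $p_{i}$.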

In [None]:
k = 1000

In [None]:
# Cut X into k equal-length sub-arrays. np.split raises ValueError when the
# length of X is not an exact multiple of k.
partitions = np.split(X, k)

In [None]:
# Mean of every partition. Stored as a list rather than a lazy `map` iterator:
# an iterator is used up after a single pass, so running the reduce step a
# second time would fail on an empty sequence.
meanMap = [np.mean(part) for part in partitions]

In [None]:
# Add the partition means together, then average over the k partitions.
meanReduce = reduce(lambda running, nxt: running + nxt, meanMap) / k
print(meanReduce)

In [None]:
# Simpler implementation based on the last formula above: add up the partition sums and divide once by the total count n*k:

In [None]:
# Sum each partition, add the partial sums together, and divide once by the
# total number of samples.
meanMap = map(np.sum, partitions)
meanReduce = reduce(lambda running, nxt: running + nxt, meanMap) / len(X)
print(meanReduce)

## Exercise: Find the mean of the sequence $Z$ below.
## Use three different partition sizes, and check if the result changes
## Is it a Gaussian distribution?

In [None]:
Z = np.random.beta(2, 5, size=(1000000,))

In [None]:
# enter code here

## Mode & Median

### Median - midpoint of a _sorted_ sequence of values

In [None]:
np.median(Z)

In [None]:
# Manual median: sort the values and split them at the midpoint.
sort_z = np.sort(Z)
middle = sort_z.shape[0] // 2
left = sort_z[:middle]
right = sort_z[middle:]

if sort_z.shape[0] % 2:
    # Odd length: `right` starts with the single middle element.
    median = right[0]
else:
    # Even length: average the two elements on either side of the split.
    median = (left[-1] + right[0]) / 2
print(median)

### Mode - the most frequently occurring value

In [None]:
from collections import Counter

# Tally how many times each fruit name appears in the sample list.
counter = Counter([
    'apple', 'orange', 'banana', 'apple', 'apple',
    'orange', 'banana', 'orange', 'banana', 'apple',
])
counter

In [None]:
counter.most_common(1)

In [None]:
df = pd.read_csv('data/hwg.csv')
df.head()

In [None]:
df['Gender'].mode()

In [None]:
df['Gender'].value_counts()

## Variance and Standard Deviation
## $$ V[X] = E[(X - \mu)^2]$$
### where
## $$ \mu = E[X] $$

### The arithmetic mean is a linear operator, thus,
## $$ V = E[(X - \mu)^2]$$
## $$ \therefore V = E[X^2 - 2X\mu + \mu^2] $$
## $$ \therefore V = E[X^2] - 2E[X]\mu + \mu^2 $$
## $$ \therefore V = E[X^2] - 2\mu^2 + \mu^2 $$
## $$ \therefore V = E[X^2] - \mu^2 $$

In [None]:
mu = meanReduce

In [None]:
mu_sq = meanReduce ** 2

## Exercise: Calculate the variance of $X$
### Hint: Use the following function to square the elements in a sequence:

In [None]:
def square_sequence(x):
    """Return a new list containing the square of each element of ``x``."""
    return list(map(lambda element: element ** 2, x))

In [None]:
# enter code here

In [None]:
print(X.var())

## Exercise: Normalize the following dataset 
### Step 1: Centering - subtract the mean of each column from that column
### Step 2: Scaling - Divide the centered values of each column by the standard deviation of that column
### Hint: use the `np.mean` and `np.var` (or `np.std`) functions

In [None]:
# Reload the dataset from data/hwg.csv.
df = pd.read_csv('data/hwg.csv')
# X now refers to the same DataFrame object as df (no copy is made), replacing
# the NumPy sample that X was bound to in earlier cells.
X = df
df.head()

In [None]:
# enter code here