# Statistics and random numbers

In [None]:
%matplotlib inline
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

## Distribution and histogram

In [None]:
# Draw N random samples from the standard normal (Gaussian) distribution.
# A seeded Generator is used so that the samples -- and every figure derived
# from them -- are identical after "Restart Kernel -> Run All".
# Change the seed to look at a different realization.
N = 1000
rng = np.random.default_rng(seed=0)
X = rng.normal(size=N)
X[:10]  # peek at the first few samples rather than dumping the whole array

In [None]:
# Scatter every sample against its index; the dashed red line marks y = 0.
fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(X, 'ko', markersize=2)
ax.hlines(0, 0, N, color='red', linestyles='--')
# Fixed axis limits: x spans all sample IDs, y is symmetric about zero.
ax.set_xlim([0, N])
ax.set_ylim([-5, 5])
ax.set_xlabel('sample ID')
ax.set_ylabel('value')

Look at the plot above. Do you notice a slight clustering of points around $y=0$?

In [None]:
# Bin edges for the histogram: 9 evenly spaced edges spanning [-4, 4],
# which gives 8 bins of unit width.
bins = np.linspace(start=-4, stop=4, num=9)
bins

The `density` parameter in the `histogram` function controls whether the result is normalized or not. 
See https://numpy.org/doc/stable/reference/generated/numpy.histogram.html

In [None]:
# Normalized histogram of the samples: with density=True the bin heights
# integrate to one, so they can be compared directly with a pdf.
hist, hist_edges = np.histogram(X, bins=bins, density=True)

# Centre of each bin, used as the x-coordinate of the histogram markers.
binsp = (bins[:-1] + bins[1:]) / 2
print(binsp)

fig, ax = plt.subplots()
ax.plot(binsp, hist, 'ko')

# Standard normal pdf evaluated on a finer grid, drawn as the red curve.
bins2 = np.linspace(-4, 4, 51)
pdf = stats.norm.pdf(bins2)
ax.plot(bins2, pdf, 'r-')

**Exercise** Try different values of sample size $N$. For example, compare the results of $N=100$ and $N=1000$. Why does the result look "bad" (not falling on the red pdf curve of normal distribution) when $N$ is small?

We can also perform a maximum-likelihood fit of the data to estimate the parameters of the underlying distribution.

In [None]:
# Draw N samples from a normal distribution with mean 1 and unit standard
# deviation. The shift is expressed through `loc`, the same parameter that
# the maximum-likelihood fit estimates further down.
# The Generator is seeded so the fit result is reproducible on every run.
N = 1000
rng = np.random.default_rng(seed=1)
X = rng.normal(loc=1.0, scale=1.0, size=N)


Note that this data set is **not** clustered around $y=0$; it is centered on $y=1$ instead:

In [None]:
# Scatter the shifted samples against their index; the red line marks y = 0
# so the offset of the point cloud from zero is visible.
fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(X, 'ko', markersize=2)
ax.hlines(0, 0, N, color='red')
# Same fixed limits as the unshifted plot so the two figures are comparable.
ax.set_xlim([0, N])
ax.set_ylim([-5, 5])
ax.set_xlabel('sample ID')
ax.set_ylabel('value')

Estimate the normal distribution parameters:

In [None]:
# Maximum-likelihood fit of a normal distribution to the samples.
# `stats.norm.fit` returns the pair (location, scale); they are unpacked into
# `loc` and `std` for the plotting cell that follows.
mle_params = stats.norm.fit(X)
loc, std = mle_params
mle_params

In [None]:
# Normalized histogram of the shifted samples on unit-width bins over [-5, 5].
bins = np.linspace(-5, 5, 11)
hist, hist_edges = np.histogram(X, bins=bins, density=True)

# Bin centres serve as x-coordinates for the histogram markers.
binsp = (bins[:-1] + bins[1:]) / 2

fig, ax = plt.subplots()
ax.plot(binsp, hist, 'ko')

# Normal pdf shifted and scaled with the fitted (loc, std), on a finer grid.
bins2 = np.linspace(-5, 5, 51)
pdf = stats.norm.pdf(bins2, loc, std)
ax.plot(bins2, pdf, 'r-')

# Faint dashed guide at the fitted location; the y-limit clips it at 0.45.
ax.vlines(loc, 0, 0.5, linestyles='--', colors='gray', alpha=0.5)
ax.set_ylim([0, 0.45])

## Other distribution functions

The normal distribution is not the only distribution. Consider the gamma distribution:

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gamma.html

The pdf of the gamma distribution with shape parameter $a=2$ (and SciPy's default scale of 1) looks like this:

In [None]:
# Gamma pdf with shape a=2 evaluated on a uniform grid over [0, 10].
x = np.linspace(0, 10, 101)
gamma_pdf = stats.gamma.pdf(x, a=2)
plt.plot(x, gamma_pdf, 'k-')

We can draw random samples that follow a gamma distribution (here with shape 2 and scale 2):

In [None]:
# Draw N samples from a gamma distribution with shape 2 and scale 2.
# The parameters are passed by keyword because the positional order of
# the sampler is (shape, scale), which is easy to mix up.
# The Generator is seeded so the histogram below is reproducible.
N = 1000
rng = np.random.default_rng(seed=2)
X = rng.gamma(shape=2.0, scale=2.0, size=N)

In [None]:
# Scatter the gamma samples against their index; the dashed red line marks
# y = 0, the lower bound of the data.
fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(X, 'ko', markersize=2)
ax.hlines(0, 0, N, color='red', linestyles='--')
# The y-range starts slightly below zero so the reference line is not on the frame.
ax.set_xlim([0, N])
ax.set_ylim([-2, 20])
ax.set_xlabel('sample ID')
ax.set_ylabel('value')

Observe that the data are all positive but clustered towards $y=0$.

Now we can compute the histogram:

In [None]:
# Bin edges: 21 evenly spaced edges spanning [0, 20], i.e. 20 unit-width bins.
bins = np.linspace(start=0, stop=20, num=21)
bins

In [None]:
# Normalized histogram of the gamma samples (density=True makes the bin
# heights integrate to one so they are comparable with a pdf).
hist, hist_edges = np.histogram(X, bins=bins, density=True)

# Plot the bin heights at the bin centres.
binsp = (bins[:-1] + bins[1:]) / 2
plt.plot(binsp, hist, 'ko', label='samples')

# Finer grid over the same range, used to draw the analytic pdf.
x = np.linspace(0, 20, 101)

def gamma_distribution(x, scale=2, shape=2):
    """Probability density function of the gamma distribution.

    For ``x >= 0`` the density is::

        x**(shape - 1) * exp(-x / scale) / (Gamma(shape) * scale**shape)

    and it is zero for ``x < 0``, which lies outside the support of the
    distribution. The evaluation is delegated to ``scipy.stats.gamma.pdf``,
    which handles the support boundary.

    Parameters
    ----------
    x : float or array_like
        Point(s) at which to evaluate the density.
    scale : float, default 2
        Scale parameter of the distribution.
    shape : float, default 2
        Shape parameter of the distribution.

    Returns
    -------
    float or numpy.ndarray
        Density at ``x``, with the same shape as ``x``.

    Notes
    -----
    The positional order here is ``(x, scale, shape)``, which is the reverse
    of ``np.random.gamma(shape, scale)``. Pass the parameters by keyword when
    they differ.
    """
    return stats.gamma.pdf(x, a=shape, scale=scale)

# Overlay the analytic gamma pdf (scale 2, shape 2) as a dashed red curve,
# then draw the legend for all labelled artists on the current axes.
plt.plot(x, gamma_distribution(x, scale=2, shape=2), 'r--', label='gamma distribution')
plt.legend()

Another way to plot the histogram:

In [None]:
# Let matplotlib compute and draw the normalized histogram in one call.
bins = np.linspace(0, 20, 21)
# plt.hist returns (bin heights, bin edges, bar patches); the names are kept
# so that later cells can still refer to them.
c, b, ignored = plt.hist(X, bins, density=True, label='samples')
plt.plot(x, gamma_distribution(x, 2, 2), 'r--', label='gamma distribution')
plt.xlim([0, 20])
# The labels above are only shown once a legend is drawn.
plt.legend()