In [17]:
import numpy as np
import scipy.stats as stats

# plotting libraries
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff


# 1. Random Variables and Distributions

## Definition of Random Variable
"A quantity having a numerical value for each member of a group, especially one whose values occur according to a frequency distribution." - Oxford

## Types of Random Variables
**Discrete Random Variables** - A RV whose possible values are finite or countably infinite (e.g., a die roll)
<br><br>
**Continuous Random Variables** - A RV that can take any value in a continuum, i.e., uncountably many possible values (e.g., a normally distributed variable)
<br>*Note - the probability of any single point in a continuous RV is 0*

## Functions of Random Variables

#### Discrete RVs

*Probability Mass Function (PMF)*
<br> 
$ f(x) = P(X=x)$, 
<br>
where $ \sum_{x} f(x) = 1$ (the sum runs over all possible values of $X$)
<br><br>

*Cumulative Distribution Function (CDF)*
<br>
$ F(x) = P(X \leq x) = \sum_{t \leq x} f(t)$

#### Continuous RVs

*Probability Density Function (PDF)*
<br>
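$ f(x) \geq 0$, with $P(a \leq X \leq b) = \int_{a}^{b} f(x)\,dx$ (the density itself is not a probability: $P(X=x) = 0$ for every single point $x$)
<br>
where $\int\limits_{-\infty}^\infty f(x)\,dx = 1$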

<br>
*Cumulative Distribution Function (CDF)*
<br>
$ F(x) = P(X \leq x) = \int_{-\infty}^{x}f(t)\,dt$

#### Discrete Distributions - Probability Mass Function (PMF)

In [120]:
# Support of a Bernoulli variable: the two outcomes 0 and 1
x = [0, 1]

# Freeze the distribution once, then evaluate its PMF on the support
bernoulli_rv = stats.bernoulli(p=0.4)
pmf_values = bernoulli_rv.pmf(x)

fig = px.scatter(x=x, y=pmf_values, title='PMF of the Bernoulli(p=0.4) Distribution')

# Pad the x-axis so both points sit away from the plot edges
fig.update_xaxes(range=[-0.5, 1.5])

#### Discrete Distributions - Cumulative Distribution Function (CDF)

In [137]:
# Dense grid slightly wider than [0, 1] so the steps of the CDF are visible
x = np.linspace(-0.1, 1.1, 1000)

# Evaluate the Bernoulli(p=0.4) CDF on the grid, then draw it as a line
cdf_values = stats.bernoulli.cdf(k=x, p=0.4)
px.line(x=x, y=cdf_values, title='CDF of the Bernoulli(p=0.4) Distribution')

#### Continuous Distributions - Probability Density Function (PDF)

In [126]:
# Grid covering four standard deviations on either side of the mean
x = np.arange(-4, 4, 0.001)

# Density of the standard normal (loc=0, scale=1 are the scipy defaults)
pdf_values = stats.norm.pdf(x, loc=0, scale=1)
px.line(x=x, y=pdf_values, title='PDF of the Normal(0,1) Distribution')

#### Continuous Distributions - Cumulative Distribution Function (CDF)

In [75]:
# Reuses the grid `x` built in the PDF cell; evaluates the standard normal CDF on it
cdf_values = stats.norm.cdf(x, loc=0, scale=1)
px.line(x=x, y=cdf_values, title='CDF of the Normal(0,1) Distribution')

## Distributions - Sampling from Different RVs

##### Continuous Distributions

In [38]:
# Normal: draw 1000 samples from the standard normal and bin them
X = np.random.normal(loc=0, scale=1, size=1000)

px.histogram(
    x=X,
    nbins=100,
)

In [143]:
# Uniform: draw 1000 samples from the continuous Uniform(0, 1)
X = np.random.uniform(low=0, high=1, size=1000)

px.histogram(
    x=X,
)

In [56]:
##### Discrete Distributions

In [144]:
# Discrete Uniform (e.g., a die toss)
# Draw the faces directly as integers 1..6 (randint's upper bound is exclusive).
# Rounding up a continuous Uniform(0, 6) draw yields floats and can, in
# principle, return 0 because the lower bound 0.0 is inclusive.
X = np.random.randint(1, 7, size=1000)

px.histogram(x=X, nbins=6)

In [55]:
# Poisson 
X = np.random.poisson(1, size=1000)

px.histogram(x=X, nbins=100)

## The Normal Distribution

### Norm($\mu$, $\sigma^2$)

$\mu$ = mean 
<br>
$\sigma^2$ = variance
<br>
$\sigma$ = standard deviation = $\sqrt{\text{variance}}$

In [162]:
# Parameters of the example normal distribution: mean and variance
mu, sigma_squared = 3, 16

# The standard deviation is the square root of the variance
sigma = np.sqrt(sigma_squared)

In [186]:
# pdf
# Grid wide enough to show roughly three standard deviations around mu
x = np.arange(-10, 15, 0.001)
px.line(x=x,
        y=stats.norm.pdf(x, loc=mu, scale=sigma),
        # Build the title from the parameters in use so the label matches the
        # plotted curve (it previously claimed Normal(0,1))
        title=f'PDF of the Normal({mu}, {sigma_squared}) Distribution')

In [187]:
# sampling: 1000 draws from the normal with mean `mu` and standard deviation `sigma`
X = np.random.normal(mu, sigma, 1000)

px.histogram(
    x=X,
    nbins=100,
)

In [188]:
# Histogram plus density curve of the samples in X; the rug plot is turned off
ff.create_distplot(
    hist_data=[X],
    group_labels=['Samples from Normal Dist.'],
    show_rug=False,
)

## Standardized Normal Distribution

### Z($\mu=0$, $\sigma^2=1$)

In [172]:
# Standardise the samples: subtract the mean, then divide by the standard deviation
centered = X - mu
Z = centered / np.sqrt(sigma_squared)

px.histogram(
    x=Z,
    nbins=100,
)

## T-Distribution

### T($t$, $\nu$)

# 2. Measures of Central Tendency (Estimates of Location)

## Intro to Central Tendency

In [2]:
# Small example dataset used by the summary-statistic cells that follow
data = [
    2, 3, 3, 4,
    7, 9, 12,
]

$Median$ is the "middle" value of the data set (i.e., the 50th percentile).
<br>
Median is more robust to outliers, as outliers don't "pull" on the median as much as they do the mean. 

In [212]:
# Two equivalent ways to obtain the median: the dedicated helper ...
median = np.median(data)
# ... or the 0.5 quantile, which assigns the same value again
median = np.quantile(data, 0.5)

print(median)

4.0


$ Mean = \bar{x} = \frac{\sum_{i=1}^{n} x_i}{n} $
<br>
The mean is probably the best known estimator for location.

In [211]:
# Arithmetic mean of the example data
mean = np.asarray(data).mean()
print(mean)

5.714285714285714


$ Variance = \sigma^2 = \sum_{i=1}^{n} \frac{(x_i - \bar{x})^2}{n} $

In [210]:
# Variance with divisor n (ddof=0 is NumPy's default, written out here)
var = np.var(data, ddof=0)
print(var)

11.918367346938775


$ \text{Standard Deviation} = \sigma = \sqrt{\text{Variance}}  $

In [208]:
# Standard deviation with divisor n (ddof=0 is NumPy's default, written out here)
std = np.std(data, ddof=0)
print(std)

3.4522988495984492


$ Quantiles  $

In [205]:
# The 0.5 quantile (second quartile) of the example data
quartile_50 = np.quantile(data, 0.5)
print(quartile_50)

4.0


In [206]:
# The 0.25 quantile (first quartile) of the example data
quartile_25 = np.quantile(data, 0.25)
print(quartile_25)

3.0


In [207]:
# All three quartiles in one call: passing a list of levels returns an array
quartile_levels = [0.25, 0.5, 0.75]
quartiles = np.quantile(data, quartile_levels)
print(quartiles)

[3. 4. 8.]


$ \text{Interquartile Range (IQR)} = Q3 - Q1  $

In [202]:
# Interquartile range: third quartile minus first quartile
q1, q3 = np.quantile(data, q=[0.25, 0.75])
IQR = q3 - q1
print(IQR)

5.0


In [214]:
# Box plot of the example dataset whose quartiles and IQR were just computed
# (X holds the normal samples drawn in the earlier Normal Distribution section).
px.box(y=data)