# Statistics Notes
This notebook was drafted by Thomas M Hughes to provide a series of basic Python functions for solving common problems in statistics.  It also includes descriptions and notes, so that everything is collected in one central repository of information.

In [None]:
## Hacker Rank: read the problem input from stdin
import sys

# The first input line is parsed as a single integer (presumably the element
# count -- confirm against the problem statement); the second line holds the
# whitespace-separated integers themselves.
N = int(sys.stdin.readline())
nums = list(map(int, sys.stdin.readline().split()))

## Measures of Central Tendency
### Mean
The *mean* is the simple average of a distribution of numbers.

### Median
The *median* is the value that occurs at the exact middle of a distribution of numbers.  The distribution must be sorted.  If the distribution has an even number of elements, the middle falls between two values, and the median is the mean of those two values.

### Mode
The *mode* is the value that occurs most frequently in a list of numbers.

In [None]:
def mean(l):
    """Return the arithmetic mean (simple average) of the numbers in ``l``.

    The result is always a float.

    Raises:
        ZeroDivisionError: if ``l`` is empty.
    """
    # float() forces true division, so a list of integers is not truncated
    # by integer division on Python 2.
    return sum(l) / float(len(l))
    
def median(l):
    """Return the value at the center of the distribution ``l``, as a float.

    The caller's list is left untouched; a sorted copy is used instead.
    For an even number of elements the result is the value halfway between
    the two middle elements.

    Raises:
        IndexError: if ``l`` is empty.
    """
    # Order matters in a median, but the caller's list must not be reordered
    ordered = sorted(l)
    count = len(ordered)
    # Floor division: list indices must be integers on Python 3 as well
    mid = count // 2

    # Even count: the median lies exactly between the two middle values
    if count % 2 == 0:
        return (ordered[mid] + ordered[mid - 1]) / 2.

    # Odd count: the middle element is the median; 1.* keeps the result a float
    return 1. * ordered[mid]

def mode(l):
    """Return the most frequent value in the distribution ``l``.

    If several values share the highest frequency, the smallest of them is
    returned.  The caller's list is not modified.

    Raises:
        ValueError: if ``l`` is empty.
    """
    # Local import keeps this function self-contained within the notebook
    from collections import Counter

    counts = Counter(l)
    highest = max(counts.values())

    # Pick the smallest tied value explicitly instead of relying on the
    # iteration order of the dictionary.
    return min(value for value, count in counts.items() if count == highest)

In [None]:
def weighted_mean(d):
    """Return the weighted mean of a ``{value: weight}`` dictionary.

    Each value contributes in proportion to its share of the total weight.
    An empty dictionary yields 0.
    """
    total_weight = sum(d.values())

    # 1.* forces float division when the weights are integers
    return sum(value * (1. * weight / total_weight) for value, weight in d.items())

In [None]:
def quartiles(l):
    """Return the three quartiles ``(Q1, Q2, Q3)`` of the distribution ``l``.

    Q2 is the median of the whole distribution.  Q1 and Q3 are the medians of
    the lower and upper halves; when the number of elements is odd, the
    middle element is left out of both halves.  The caller's list is not
    modified.

    With fewer than two elements the halves are empty, and taking their
    median fails.
    """
    # Work on a sorted copy so the caller's data is neither reordered nor shortened
    ordered = sorted(l)
    count = len(ordered)
    # Floor division: slice bounds must be integers on Python 3 as well
    half = count // 2

    q2 = median(ordered)

    lower = ordered[:half]
    # count % 2 skips the middle element when the number of elements is odd
    upper = ordered[half + count % 2:]

    return median(lower), q2, median(upper)
    

In [None]:
def IQR(l):
    """Return the interquartile range of ``l``: the third quartile minus the first."""
    first, _, third = quartiles(l)
    return third - first

In [None]:
def variance(l):
    """Return the variance of ``l``: the average squared deviation from the mean.

    The sum of squared deviations is divided by the number of elements
    (not by one less than it).
    """
    center = mean(l)

    squared_deviations = 0
    for value in l:
        squared_deviations += (value - center) ** 2

    # 1.* forces float division
    return squared_deviations / (1. * len(l))
    
def stdev(l):
    """Return the standard deviation of ``l``: the square root of ``variance(l)``."""
    spread = variance(l)
    return spread ** 0.5

In [None]:
# reduce is a builtin only on Python 2; importing it from functools works on
# Python 2.6+ and Python 3 alike.
from functools import reduce
from operator import mul

def product(i):
    """Return the product of all elements of the iterable ``i``.

    An empty iterable yields 1, the multiplicative identity.
    """
    return reduce(mul, i, 1)

def compound_event_probability(l):
    """Return the product of the probabilities in ``l``.

    This is the probability that all of the events occur, assuming the
    events are independent of each other.
    """
    combined = product(l)
    return combined

### Binomial Variables and Distributions

In [None]:
def factorial(x):
    """Return ``x!``, computed as the product of the integers 2 through ``x``.

    For ``x`` below 2 there are no factors and the result is 1.
    """
    factors = range(2, x + 1)
    return product(factors)
        
def combination(n, x):
    """Return the number of ways to choose ``x`` items out of ``n`` ("n choose x").

    The result is an exact integer.  Choosing a negative number of items, or
    more items than are available, is impossible and yields 0.
    """
    # Out-of-range selections would otherwise feed a negative number to
    # factorial() and produce a meaningless value (e.g. combination(0, 1) == 1).
    if x < 0 or x > n:
        return 0

    # The division is always exact; floor division keeps the result an
    # integer on Python 3 instead of a float that loses precision (or
    # overflows) for large n.
    return factorial(n) // (factorial(x) * factorial(n - x))

def binomial_probability(x, n, p):
    """Return the probability of exactly ``x`` successes out of ``n`` cases.

    ``p`` is the probability of success for each individual case.
    """
    failure = 1 - p
    arrangements = combination(n, x)
    return arrangements * p ** x * failure ** (n - x)

def at_least_binomial_probability(r, n, p):
    """Return the probability of ``r`` or more successes out of ``n`` cases.

    Sums the exact binomial probabilities for ``r`` through ``n`` successes,
    where ``p`` is the probability of success for each case.
    """
    total = 0
    for successes in range(r, n + 1):
        total += binomial_probability(successes, n, p)
    return total

def at_most_binomial_probability(r, n, p):
    """Return the probability of ``r`` or fewer successes out of ``n`` cases.

    Sums the exact binomial probabilities for 0 through ``r`` successes,
    where ``p`` is the probability of success for each case.
    """
    total = 0
    for successes in range(0, r + 1):
        total += binomial_probability(successes, n, p)
    return total

In [None]:
def negative_binomial(x, n, p):
    """Return the probability that the ``x``-th success occurs on experiment ``n``.

    If X is the number of experiments until the x-th success occurs, then X
    is a discrete random variable called a negative binomial.  ``p`` is the
    probability of success for each experiment.
    """
    failure = 1 - p
    # The final experiment is the x-th success, so only the other x-1
    # successes are arranged among the first n-1 experiments.
    placements = combination(n - 1, x - 1)
    return placements * p ** x * failure ** (n - x)

def geometric_distribution(n, p):
    """Return the probability that the first success occurs on trial ``n``.

    A geometric distribution is a negative binomial with exactly one
    success.  ``n`` is the number of trials and ``p`` is the probability of
    success per trial.
    """
    return negative_binomial(x=1, n=n, p=p)

def cumulative_geometric_distribution(n, p):
    """Return the probability of a success within the first ``n`` trials.

    Adds up the geometric probabilities of the first success landing on
    trial 1, 2, ..., ``n``; ``p`` is the probability of success per trial.
    """
    total = 0
    for trial in range(1, n + 1):
        total += geometric_distribution(trial, p)
    return total

In [None]:
def poisson_distribution(k, lda):
    """Return the probability of exactly ``k`` successes under a Poisson distribution.

    ``lda`` (lambda) is the average number of successes.
    """
    # Local import: math.exp avoids depending on a module-level ``e`` that is
    # only assigned further down the notebook, and is more accurate than
    # raising e to a power.
    import math

    return lda ** k * math.exp(-lda) / factorial(k)

In [None]:
import math

# Module-level shorthands for the mathematical constants e and pi
e, pi = math.e, math.pi

def normal_distribution_probability_density(x, mu, var):
    """Return the normal probability density at ``x``.

    ``mu`` is the mean of the distribution and ``var`` is its variance.
    """
    sigma = var ** 0.5
    # Scaling factor in front of the exponential: 1 / (sigma * sqrt(2 * pi))
    scale = 1. / (sigma * (2. * pi) ** 0.5)
    exponent = -1. * (x - mu) ** 2. / (2. * var)
    return scale * e ** exponent

In [None]:
def cdf(x, mu, var):
    """Return the normal cumulative distribution function evaluated at ``x``.

    ``mu`` is the mean of the distribution and ``var`` is its variance.
    """
    # Distance from the mean, scaled by sigma * sqrt(2) as the error function expects
    z = (x - mu) / (var ** 0.5 * 2 ** 0.5)
    return 0.5 * (1 + math.erf(z))

def cdf_range(x, y, mu, var):
    """Return ``cdf(y) - cdf(x)`` for the normal distribution with mean ``mu`` and variance ``var``.

    This is the probability of a value falling between ``x`` and ``y``.
    """
    upper = cdf(y, mu, var)
    lower = cdf(x, mu, var)
    return upper - lower