In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from statsmodels import regression
import statsmodels.api as sm

In [None]:
# Generate 20 random integers in [1, 100) to reuse throughout the notebook.
# A fixed seed makes "Restart & Run All" reproducible, and excluding 0 keeps
# the geometric and harmonic means computed later from degenerating.
SEED = 42
rng = np.random.default_rng(SEED)
X = rng.integers(1, 100, size=20)
Y = rng.integers(1, 100, size=20)

# Python Tutorial - Statistics

The theory is covered in my notes. Here I give the code for various concepts!

## 1. Central Tendency

### - Arithmetic mean

$$\mu = \frac{\sum_{i=1}^N X_i}{N}$$


In [None]:
# Arithmetic mean of the sample
X.mean()

### - Median

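With $X_{(k)}$ denoting the $k$-th smallest value:

$$\text{Median} = \begin{cases} X_{((N+1)/2)} & \text{if } N \text{ is odd} \\ \frac{1}{2}\left(X_{(N/2)} + X_{((N+2)/2)}\right) & \text{if } N \text{ is even} \end{cases}$$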


In [None]:
# Median: the middle value of the sorted data
# (NumPy averages the two middle values when the number of elements is even)
np.median(X)

### - Mode

Most frequently occurring value in a data set.

In [None]:
# Mode: only one most-frequent value is reported (ties are not all listed).
# SciPy >= 1.11 returns scalars from stats.mode while older versions return
# length-1 arrays; np.atleast_1d makes the indexing work with both.
np.atleast_1d(stats.mode(X).mode)[0]

### - Geometric mean

$$ G = \sqrt[N]{X_1X_2\ldots X_N} $$

In [None]:
# Geometric mean: the N-th root of the product of the N values
# (intended for positive data; a zero in X drives the result to 0)
stats.gmean(X)

### - Harmonic mean

$$ H = \frac{N}{\sum_{i=1}^N \frac{1}{X_i}} $$

In [None]:
# Harmonic mean: N divided by the sum of the reciprocals of the values
# (intended for positive data)
stats.hmean(X)

### - Quantiles

Cutpoints dividing the range of a probability distribution into contiguous intervals with equal probabilities.

In [None]:
# q-th percentile of the data (q=50 gives the median)
np.percentile(X, 50)

<br>
<hr>

## 2. Measures of Dispersion


### - Range

Difference between the maximum and minimum values in a dataset.

In [None]:
# Range: largest value minus smallest value
X.max() - X.min()

### - Mean Absolute Deviation (MAD)

$$ MAD = \frac{\sum_{i=1}^N |X_i - \mu|}{N} $$


In [None]:
# Mean Absolute Deviation: average distance of the observations from their mean
mu = np.mean(X)
abs_dispersion = np.abs(X - mu)
abs_dispersion.sum() / abs_dispersion.size

### - Variance

$$ \sigma^2 = \frac{\sum_{i=1}^N (X_i - \mu)^2}{N} $$

In [None]:
# Population variance (divides by N)
X.var()

### - Standard Deviation

$$ s = \sqrt{\sigma^2} $$

In [None]:
# Population standard deviation (square root of the variance above)
X.std()

### - Semivariance & Semideviation

$$ \frac{\sum_{X_i < \mu} (X_i - \mu)^2}{N_<} $$

In [None]:
# Semivariance: mean squared deviation from the mean, computed only over the
# observations at or below the mean (ties are included, so `lows` is never empty)
mu = np.mean(X)
lows = X[X <= mu]
semivariance = np.sum((lows - mu) ** 2) / len(lows)

# Semideviation: square root of the semivariance
semideviation = np.sqrt(semivariance)

# A cell only displays its last expression, so show both results together
semivariance, semideviation

### - Coefficient of Variation

Coefficient of Variation (CV) is also known as relative dispersion (relative to a benchmark) and it is the standard deviation divided by the mean.

$$ CV = \frac{s}{\mu} $$

In [None]:
# Coefficient of Variation: standard deviation divided by the mean
# (SciPy uses the population standard deviation, ddof=0, by default)
stats.variation(X)

### - Skew

$$ S_{k} =\frac{N}{(N-1)(N-2)} \sum_{i=1}^N  \frac{(X_i - \mu)^3}{\sigma^3} $$

In [None]:
# Skewness
# (SciPy's default, bias=True, returns the biased estimator without the
# small-sample correction; pass bias=False to apply that correction)
stats.skew(X)

### - Kurtosis

$$ K = \frac{N(N+1)}{(N-1)(N-2)(N-3)} \sum_{i=1}^N  \frac{(X_i - \mu)^4}{\sigma^4} $$

In [None]:
# Kurtosis
# (SciPy's defaults return the biased, excess (Fisher) kurtosis, i.e. a
# normal distribution scores 0 rather than 3)
stats.kurtosis(X)

<br>
<hr>

## 3. Bivariate Relationships

### - Covariance

$$Cov(X,Y) = \frac{\sum_{i=1}^N (X_i - \mu_X) (Y_i - \mu_Y)}{N}$$

In [None]:
# Covariance matrix (population covariance): ddof=0 divides by N, matching the
# formula above and np.var/np.std; np.cov's default would divide by N - 1.
np.cov(X, Y, ddof=0)

### - Correlation

$$r(X,Y) = \frac{Cov(X,Y)}{std(X)std(Y)}$$

In [None]:
# Correlation matrix: 2x2 array of Pearson correlation coefficients
# (ones on the diagonal, r(X, Y) off the diagonal)
np.corrcoef(X, Y)

### - Spearman Rank Correlation

Instead of looking at the relationship between the two variables directly, we look at the relationship between their ranks. This is robust to outliers and to the scale of the data. It is useful when the data sets are in different units (and therefore may not be linearly related), and for data sets that do not satisfy the assumptions other tests require, such as the observations being normally distributed, as would be necessary for a t-test.

$$r_S = 1 - \frac{6 \sum_{i=1}^n d_i^2}{n(n^2 - 1)}$$

where $d_i$ is the difference between the ranks of the $i$th pair of observations, $d_i = \operatorname{rank}(X_i) - \operatorname{rank}(Y_i)$.


In [None]:
# Spearman Rank Correlation: returns the correlation coefficient together with a p-value
stats.spearmanr(X, Y)

## 4. Linear Regression

### - Linear Regression

In [None]:
# Our own function: performs linear regression, plots the results and returns the fit summary.
def linreg(X, Y):
    """Fit Y = a + b * X by ordinary least squares, plot the data, and return the summary.

    Parameters
    ----------
    X, Y : 1-D array-like
        Predictor and response observations of equal length.

    Returns
    -------
    The object returned by ``model.summary()`` for the fitted statsmodels OLS model.
    """
    # statsmodels does not add an intercept on its own, so prepend a constant column.
    # A separate name leaves the caller's X untouched for plotting.
    design = sm.add_constant(X)
    model = regression.linear_model.OLS(Y, design).fit()

    # Scatter plot plus seaborn's fitted regression line.
    # x/y are passed by keyword: recent seaborn releases reject positional data arguments.
    plt.figure(figsize=(12, 8))
    sns.regplot(x=X, y=Y, scatter_kws={'color': 'red'})
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title('Linear Regression', size=15)
    plt.show()
    return model.summary()

linreg(X, Y)

In [None]:
# Linear regression with SciPy: unpack the fitted slope/intercept and fit statistics
slope, intercept, r_value, p_value, std_err = stats.linregress(X, Y)

# Linear regression with scikit-learn
# scikit-learn expects a 2-D feature matrix, so turn X into a single column
X_reshaped = X.reshape(-1, 1)
LinearRegression().fit(X_reshaped, Y)