# Means | Measures of Central Tendency

## Arithmetic mean

In [1]:
import scipy.stats as stats
import numpy as np

# Two small example data sets: x2 is x1 plus a single large outlier (100)
x1 = [1, 2, 2, 3, 4, 5, 5, 7]
x2 = [*x1, 100]

# Arithmetic mean = sum of the observations / number of observations
for heading, sample in (('Mean of x1:', x1), ('Mean of x2:', x2)):
    print(heading, sum(sample), '/', len(sample), '=', np.mean(sample))

Mean of x1: 29 / 8 = 3.625
Mean of x2: 129 / 9 = 14.333333333333334


## Median

In [2]:
# The median is the middle value of the sorted data
# (the average of the two middle values when the count is even)
for heading, sample in (('Median of x1:', x1), ('Median of x2:', x2)):
    print(heading, np.median(sample))

Median of x1: 3.5
Median of x2: 4.0


## Mode

In [3]:
# Scipy has a built-in mode function, but it will return exactly one value
# even if two values occur the same number of times, or if no value appears more than once.
# Older SciPy releases wrap the mode in a length-1 array, while newer releases return a
# scalar for 1-D input, so flatten the result before indexing to work with both.
print ('One mode of x1:', np.ravel(stats.mode(x1)[0])[0])

# So we will write our own
def mode(l):
    """Return every value of `l` that is tied for the highest number of occurrences.

    Parameters
    ----------
    l : sized iterable of hashable values
        The observations. It must support both iteration and ``len()``.

    Returns
    -------
    list or str
        A list of all modes, in the order in which they first appear in `l`,
        when some value occurs more than once or `l` has exactly one element.
        Otherwise (all values distinct, or `l` empty) the string ``'No mode'``.
    """
    # Count the number of times each element appears in the list.
    # A dict keeps insertion order, so keys stay in first-seen order.
    counts = {}
    for e in l:
        counts[e] = counts.get(e, 0) + 1

    # Highest frequency observed; 0 when there is no data at all
    maxcount = max(counts.values(), default=0)

    # A mode only exists if something repeats, except for a one-element
    # data set, whose single value is trivially its mode
    if maxcount > 1 or len(l) == 1:
        return [key for key, value in counts.items() if value == maxcount]
    return 'No mode'
    
# Our own mode() reports every value tied for the highest count
x1_modes = mode(x1)
print('All of the modes of x1:', x1_modes)

One mode of x1: 2
All of the modes of x1: [2, 5]


For data that can take on many different values, such as returns data, there may not be any values that appear more than once. In this case we can bin values, like we do when constructing a histogram, and then find the mode of the data set where each value is replaced with the name of its bin. That is, we find which bin elements fall into most often.

In [6]:
from nsepy import get_history
from datetime import date, datetime, timedelta
import pandas as pd

In [7]:
# Fetch TCS price history through nsepy, from the start of 2010 up to today.
# NOTE: `end` is today's date, so the rows returned depend on when the notebook is run.
start = date(2010,1,1)
end = date.today()

data = get_history(symbol="TCS", start=start, end=end)

# Move the 'Date' index into a column, parse it as datetimes, and restore it as the
# index so the frame ends up with a DatetimeIndex
data = data.reset_index()
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date')

# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output
data.info()
display(data.tail())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2280 entries, 2010-01-04 to 2019-03-01
Data columns (total 14 columns):
Symbol                2280 non-null object
Series                2280 non-null object
Prev Close            2280 non-null float64
Open                  2280 non-null float64
High                  2280 non-null float64
Low                   2280 non-null float64
Last                  2280 non-null float64
Close                 2280 non-null float64
VWAP                  2280 non-null float64
Volume                2280 non-null int64
Turnover              2280 non-null float64
Trades                1925 non-null float64
Deliverable Volume    2280 non-null int64
%Deliverble           2280 non-null float64
dtypes: float64(10), int64(2), object(2)
memory usage: 267.2+ KB


None

Unnamed: 0_level_0,Symbol,Series,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-02-25,TCS,EQ,1925.65,1932.5,1990.0,1930.5,1987.0,1985.15,1961.61,2934880,575709500000000.0,140541.0,1341508,0.4571
2019-02-26,TCS,EQ,1985.15,1984.0,2045.15,1972.25,2032.5,2038.7,2022.12,6453309,1304935000000000.0,177907.0,3366710,0.5217
2019-02-27,TCS,EQ,2038.7,2040.0,2074.95,2022.0,2052.5,2058.1,2050.25,4732082,970194700000000.0,168823.0,2532055,0.5351
2019-02-28,TCS,EQ,2058.1,2060.0,2071.35,1977.6,1983.5,1983.45,2009.42,8454295,1698821000000000.0,284872.0,3927877,0.4646
2019-03-01,TCS,EQ,1983.45,1995.05,2005.0,1980.35,1995.0,1995.4,1993.73,4148548,827106400000000.0,128204.0,2071633,0.4994


In [9]:
# Keep an independent one-column DataFrame holding only the closing prices
pricing = data['Close'].copy().to_frame()
pricing.tail()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2019-02-25,1985.15
2019-02-26,2038.7
2019-02-27,2058.1
2019-02-28,1983.45
2019-03-01,1995.4


In [11]:
# Get return data for an asset and compute the mode of the data set.
# The first row of pct_change() has no previous price to compare with, so it is dropped.
returns = pricing.pct_change()[1:]
# `returns` is a one-column DataFrame, and iterating over a DataFrame yields its column
# labels rather than its values, so hand the 'Close' column itself to mode()
print ('Mode of returns:', mode(returns['Close']))

Mode of returns: No mode


In [12]:
# When no return value repeats, bin the returns and look for the most populated bin(s) instead.
# np.histogram gives back the count per bin together with the bin edges (one more edge than bins).
hist, bins = np.histogram(returns, bins=20) # Break data up into 20 bins
maxfreq = hist.max()
# Pair each bin's left and right edge with its count and keep the interval(s) that reach maxfreq
print ('Mode of bins:', [(lo, hi) for lo, hi, freq in zip(bins[:-1], bins[1:], hist) if freq == maxfreq])

Mode of bins: [(-0.0032268023346224695, 0.028106090403132278)]


## Geometric mean

In [14]:
# scipy's gmean computes the geometric mean: the n-th root of the product of n values
for heading, sample in (('Geometric mean of x1:', x1), ('Geometric mean of x2:', x2)):
    print(heading, stats.gmean(sample))

Geometric mean of x1: 3.0941040249774403
Geometric mean of x2: 4.552534587620071


In [21]:
# Turn each return R_t into a growth ratio 1 + R_t, take the geometric mean of the
# ratios, then subtract 1 again to express the result as a rate of return R_G
ratios = returns.add(1)
R_G = stats.gmean(ratios) - 1
print('Geometric mean of returns:', R_G)

Geometric mean of returns: [0.00042849]


The geometric mean is defined so that if the rate of return over the whole time period were constant and equal to $R_G$, the final price of the security would be the same as in the case of returns $R_1, \ldots, R_T$.

In [30]:
# T is the number of return periods; `returns` drops the first row of `pricing`,
# so position T is the last price
T = len(returns)
# The price Series is indexed by date, so use .iloc for positional access:
# plain [] with an integer key on a non-integer index is a deprecated fallback
init_price = pricing['Close'].iloc[0]
final_price = pricing['Close'].iloc[T]
print('Initial price:', init_price)
print('Final price:', final_price)
# Compounding the constant rate R_G over T periods should reproduce the final price
print('Final price as computed with R_G:', init_price*(1 + R_G)**T)

Initial price: 751.65
Final price: 1995.4
Final price as computed with R_G: [1995.4]


## Harmonic Mean

In [31]:
# scipy's hmean computes the harmonic mean: n divided by the sum of the reciprocals
for heading, sample in (('Harmonic mean of x1:', x1), ('Harmonic mean of x2:', x2)):
    print(heading, stats.hmean(sample))

Harmonic mean of x1: 2.5590251332825593
Harmonic mean of x2: 2.869723656240511


The harmonic mean can be used when the data can be naturally phrased in terms of ratios. For instance, in the dollar-cost averaging strategy, a fixed amount is spent on shares of a stock at regular intervals. The higher the price of the stock, the fewer shares an investor following this strategy buys. The average price per share they end up paying (total amount spent divided by total shares bought) is the harmonic mean of the purchase prices.

# Variance | Measures of Dispersion

In [32]:
# Import libraries
import numpy as np

# Seed NumPy's global random generator so the random sample drawn below is reproducible
np.random.seed(seed=42)

In [33]:
# Draw 20 random integers in [0, 100) and keep them in ascending order
X = np.sort(np.random.randint(100, size=20))
print ('X: %s' % (X,))

# Arithmetic mean of the sample, reused by the dispersion measures below
mu = X.mean()
print ('Mean of X:', mu)

X: [ 1  2 14 20 21 23 29 37 51 52 60 71 74 74 82 86 87 87 92 99]
Mean of X: 53.1


## Range

Difference between the maximum and minimum values

In [34]:
# np.ptp ("peak to peak") is the maximum of X minus the minimum of X
value_range = np.ptp(X)
print('Range of X: %s' % value_range)

Range of X: 98


## Mean Absolute Deviation (MAD)

In [35]:
# Absolute distance of every observation from the mean, computed in one vectorized step
abs_dispersion = list(np.abs(mu - X))
# MAD is the average of those absolute distances
MAD = np.sum(abs_dispersion) / len(abs_dispersion)
print('Mean absolute deviation of X:', MAD)

Mean absolute deviation of X: 28.099999999999994


## Variance and standard deviation

In [36]:
# Variance is the mean squared deviation from the mean; standard deviation is its square root
for heading, statistic in (('Variance of X:', np.var), ('Standard deviation of X:', np.std)):
    print(heading, statistic(X))

Variance of X: 990.49
Standard deviation of X: 31.472051092993606


One way to interpret standard deviation is by referring to **Chebyshev's inequality**. This tells us that the proportion of samples within $k$ standard deviations of the mean (that is, within a distance of $k \cdot \sigma$, where $\sigma$ is the standard deviation) is at least $1 - 1/k^2$ for all $k > 1$.

Let's check that this is true for our data set.

In [37]:
# Number of standard deviations to allow on either side of the mean
k = 1.25
dist = np.std(X) * k
# Keep the observations whose distance from the mean is at most k standard deviations
l = list(X[np.abs(X - mu) <= dist])
print('Observations within', k, 'stds of mean:', l)
# Compare the observed proportion with Chebyshev's lower bound 1 - 1/k^2
print('Confirming that', len(l) / len(X), '>', 1 - 1 / k**2)

Observations within 1.25 stds of mean: [14, 20, 21, 23, 29, 37, 51, 52, 60, 71, 74, 74, 82, 86, 87, 87, 92]
Confirming that 0.85 > 0.36


## Semivariance and semideviation

Semivariance and semideviation measure dispersion using only the observations that fall at or below the mean.

In [38]:
# There is no built-in semideviation, so compute it by hand:
# select the observations at or below the mean ...
lows = list(X[X <= mu])

# ... and average their squared deviations from the mean
semivar = np.sum((np.asarray(lows) - mu) ** 2) / len(lows)

print ('Semivariance of X:', semivar)
print ('Semideviation of X:', np.sqrt(semivar))

Semivariance of X: 1073.21
Semideviation of X: 32.759884004678646


A related notion is target semivariance (and target semideviation), where we average the squared distance from a target over the values that fall at or below that target:

In [39]:
# Target value: only observations at or below B contribute
B = 19
lows_B = list(X[X <= B])
# Average squared shortfall of those observations relative to the target
semivar_B = sum((x - B)**2 for x in lows_B) / len(lows_B)

print('Target semivariance of X:', semivar_B)
print('Target semideviation of X:', np.sqrt(semivar_B))

Target semivariance of X: 212.66666666666666
Target semideviation of X: 14.583095236151571
