In [2]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [3]:
# Base sample used throughout the notebook.
x = [8.0, 1, 2.5, 4, 28.0]
# Same sample with a missing value (NaN) spliced in after the third item.
x_with_nan = [*x[:3], math.nan, *x[3:]]
print(x)

x_with_nan


[8.0, 1, 2.5, 4, 28.0]


[8.0, 1, 2.5, nan, 4, 28.0]

In [4]:
# NumPy arrays built from the two Python lists.
y = np.array(x)
y_with_nan = np.array(x_with_nan)

# pandas Series built from the same lists.
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

z

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

## Calculating Descriptive Statistics
### 1. Measures of Central Tendency

In [5]:
# Arithmetic mean with the standard library.
print(statistics.mean(x))

# fmean() was introduced in Python 3.8 as a faster alternative to mean();
# it always returns a floating-point number.
mean_ = statistics.fmean(x)
mean_


8.7


8.7

In [6]:
# With a NaN among the data points, both mean() and fmean() return nan.
print(statistics.mean(x_with_nan))

mean_ = statistics.fmean(x_with_nan)
mean_

nan


nan

In [8]:
# Arithmetic mean of the NumPy array, via the module-level function ...
mean_ = np.mean(y)
print(mean_)
# ... or, equivalently, via the ndarray method:
y.mean()  # np.mean() and ndarray.mean() return the same result as statistics.mean()

8.7


8.7

In [9]:
# To ignore NaN values instead, use np.nanmean(): it averages only the
# remaining entries, so the result equals the mean of the NaN-free sample.
np.nanmean(y_with_nan)

8.7

In [10]:
# pd.Series objects also have a .mean() method.
# Unlike the statistics functions above, pandas' .mean() ignores NaN values by default,
# so the Series containing NaN still yields the mean of the remaining values.
z_with_nan.mean()

8.7

In [11]:
# Weighted mean: sum(w_i * x_i) / sum(w_i).
# x is redefined here so the cell is self-contained; w holds one weight per item of x
# (these particular weights sum to 1).
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]
np.average(x, weights=w)

6.95

In [5]:
# Harmonic mean in pure Python: the number of items divided by the sum of their reciprocals.
# Every item must be nonzero, otherwise 1 / item raises ZeroDivisionError.
hmean = len(x) / sum(1 / item for item in x)
hmean


2.7613412228796843

In [7]:
# Harmonic mean of the NumPy array computed by SciPy.
scipy.stats.hmean(y)

2.7613412228796843

In [8]:
# Geometric mean with the standard library: the n-th root of the product of the n items.
gmean = statistics.geometric_mean(x)
gmean

4.67788567485604

In [10]:
# SciPy's gmean() is called here on the NumPy array ...
print(scipy.stats.gmean(y))

# ... and on the pandas Series built from the same data.
scipy.stats.gmean(z)

4.67788567485604


4.67788567485604

In [11]:
# Median
# The sample median is the middle element of the sorted dataset (the sort direction does not matter).
#   - Odd number of elements n: the value at the middle position 0.5(n + 1) (1-based).
#   - Even n: the arithmetic mean of the two middle values, at positions 0.5n and 0.5n + 1 (1-based).
# Mean vs. median: the mean is heavily affected by outliers or extremes, whereas the
# median depends on them only slightly or not at all.
n = len(x)
if n % 2 == 0:
    # Even length: average the two values that straddle the centre (0-based index - 1 and index).
    x_ord, index = sorted(x), n // 2
    median_ = 0.5 * (x_ord[index - 1] + x_ord[index])
else:
    # Odd length: n // 2 is the 0-based index of the single middle value.
    median_ = sorted(x)[n // 2]

median_

4

In [19]:
# Median of the full sample (odd length): the single middle value of the sorted data.
median_ = statistics.median(x)
# Print explicitly: only the LAST expression of a cell is displayed, so a bare
# `median_` at this point would be silently discarded.
print(median_)

# Median of x[:-1], i.e. x without the last item 28.0. Sorted, that is [1, 2.5, 4, 8.0];
# with an even length the median is the mean of the two middle values.
median_ = statistics.median(x[:-1])
median_

3.25

In [20]:
# Unlike most other functions from the Python statistics library, median(), median_low(),
# and median_high() don't return nan when there are nan values among the data points.
# CAUTION: that does not make the result trustworthy. These functions sort the data, and
# nan does not order consistently against other numbers, so the answer depends on where
# the nan happens to sit. In the output recorded below median() gives 6.0 although the
# median of the nan-free sample is 4. Strip nan values first, or use np.nanmedian().
print(statistics.median(x_with_nan))

# If the number of elements is even, then there are two middle values.
# In this case, median_low() returns the lower and median_high() the higher middle value.

print(statistics.median_low(x_with_nan))

print(statistics.median_high(x_with_nan))

6.0
4
8.0


In [21]:
# np.nanmedian() ignores NaN values: median of the five remaining numbers ...
print(np.nanmedian(y_with_nan))

# ... and of the array without its last item (28.0), which leaves four numbers plus the NaN.
np.nanmedian(y_with_nan[:-1])

4.0


3.25

In [24]:
# Mode
# The sample mode is the value that occurs most frequently in the dataset.
# If no single value does, the set is multimodal (it has several modal values).
# Example: in the set 2, 3, 2, 8, 12 the mode is 2, because it occurs twice while
# every other item occurs only once.

# Mode with pure Python
u = [2, 3, 2, 8, 12, 5, 9, 3, 5, 7, 5]

# Rank each distinct value by (occurrences, value): the most frequent value wins,
# and a tie in frequency is broken in favour of the larger value.
mode_ = max(set(u), key=lambda item: (u.count(item), item))
mode_

5

In [31]:
# 12 and 15 both occur four times in v, so this sample has two modal values.
v = [15, 12, 34, 12, 15, 12, 15, 21, 15, 12]
print(statistics.mode(v))  # returns the first value having highest frequency (15 here)
# multimode() returns a list with every modal value.
statistics.multimode(v)


15


[15, 12]

In [32]:
# Convert both samples to NumPy arrays (rebinds the names u and v).
u = np.array(u)
v = np.array(v)

# Single modal value: SciPy reports the mode together with its count.
print(scipy.stats.mode(u))

# If there are multiple modal values in the dataset, then only the smallest value is returned.
# NOTE(review): the recorded output shows length-1 arrays; newer SciPy releases may
# print scalars here instead - confirm against the installed version.
mode_ = scipy.stats.mode(v)
mode_

ModeResult(mode=array([5]), count=array([3]))


ModeResult(mode=array([12]), count=array([4]))

In [33]:
# The result object returned by scipy.stats.mode() exposes the mode and its number of
# occurrences through dot notation (.mode and .count); in the output recorded here
# both are NumPy arrays.
print(mode_.mode)

mode_.count

[12]


array([4])

In [35]:
# Pandas Series objects have the method .mode(), which handles multimodal values well
# and ignores nan values by default.
u = pd.Series(u)
v = pd.Series(v)
w = pd.Series([2, 2, math.nan])

# One modal value.
print(u.mode(), '\n')

# Two modal values: both are returned.
print(v.mode())

# The nan is skipped. If you want .mode() to take nan values into account,
# then just pass the optional argument dropna=False.
w.mode()

0    5
dtype: int32 

0    12
1    15
dtype: int32


0    2.0
dtype: float64