# Average of sampled data

Do imports.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

Create a random number generator (used to generate example data).

In [None]:
# Seed the generator so that "Restart Kernel -> Run All" reproduces the same
# example data (and therefore the same printed means and histograms) on every
# run. Change SEED to draw a different dataset.
SEED = 42
rng = np.random.default_rng(SEED)

Create example data.

In [None]:
# Sample size
n = 100

# Mean and standard deviation of the normal distribution
# from which the data points are drawn
mu_z, std_z = 1.2, 0.1

# Draw n samples from that normal distribution
z = rng.normal(mu_z, std_z, n)

Find the mean (i.e., the average) of the sampled data.

In [None]:
# Arithmetic mean of all sampled data points
mean_of_z = z.mean()

Compare the mean that we computed to the mean that was used to generate the data.

In [None]:
# Show the mean used to generate the data and the mean computed
# from the samples on two aligned lines
print(
    f'     mu_z = {mu_z:10.4f}',
    f'mean_of_z = {mean_of_z:10.4f}',
    sep='\n',
)

Plot a histogram of the data.

In [None]:
# Histogram of the samples over 32 equal-width bins spanning
# mu_z +/- 10 * std_z, with a dashed vertical line at the sample mean
fig, ax = plt.subplots(figsize=(6, 3))
ax.hist(z, bins=np.linspace(mu_z - 10 * std_z, mu_z + 10 * std_z, 33))
ax.plot([mean_of_z] * 2, [0, n], linestyle='--', label='mean')
ax.set_xlabel('z')
ax.set_ylabel('count')
ax.set_ylim(0, 0.4 * n)  # the mean line is drawn up to n; show only the lower 40%
ax.legend()
ax.grid()
plt.show()

**Be careful!** Even one "outlier" — i.e., one data point that is very different from the others — can change the mean a lot.

In [None]:
# Build a new dataset: the original samples plus one extreme value placed
# 100 standard deviations above the mean used to generate the data
z_with_outlier = np.concatenate((z, [mu_z + 100.0 * std_z]))

# Average the dataset that contains the outlier
mean_of_z_with_outlier = z_with_outlier.mean()

# Report the generating mean and both sample means, aligned
print(
    f'                  mu_z = {mu_z:10.4f}',
    f'             mean_of_z = {mean_of_z:10.4f}',
    f'mean_of_z_with_outlier = {mean_of_z_with_outlier:10.4f}',
    sep='\n',
)

Plot a histogram. (The outlier doesn't even show up in this plot, because it lies far outside the range covered by the histogram bins!)

In [None]:
# Histogram of the dataset with the outlier, using the same bins as before
# (mu_z +/- 10 * std_z), plus vertical lines at the mean without and with
# the outlier
fig, ax = plt.subplots(figsize=(6, 3))
ax.hist(z_with_outlier, bins=np.linspace(mu_z - 10 * std_z, mu_z + 10 * std_z, 33))
ax.plot([mean_of_z] * 2, [0, n], linestyle='--', label='mean')
ax.plot(
    [mean_of_z_with_outlier] * 2,
    [0, n],
    linestyle='-.',
    linewidth=2,
    label='mean with outlier',
)
ax.set_xlabel('z')
ax.set_ylabel('count')
ax.set_ylim(0, 0.4 * n)  # the mean lines are drawn up to n; show only the lower 40%
ax.legend()
ax.grid()
plt.show()

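One way to lessen the impact of outliers is to remove them, for example by using a "trimmed mean": sort the data, discard a fixed percentage of the smallest and of the largest values, and average what remains.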

In [None]:
# Choose percent of data to exclude at each end of the distribution
percent_to_trim = 5

# Convert from percent to number of data points (rounding up). Multiply before
# dividing: len * (percent / 100) can land just above a whole number in
# floating point (e.g., 100 * (7 / 100) == 7.000000000000001), which ceil
# would then round up to one more data point than intended.
number_of_points = len(z_with_outlier)
number_to_trim = int(np.ceil(number_of_points * percent_to_trim / 100))

# Refuse settings that would trim a negative amount or leave no data to average
if percent_to_trim < 0 or 2 * number_to_trim >= number_of_points:
    raise ValueError(
        f'percent_to_trim = {percent_to_trim} would remove {number_to_trim} values from '
        f'each end of a dataset with only {number_of_points} values'
    )
print(f'We will exclude {number_to_trim} of the smallest values and {number_to_trim} of the largest values:\n')

# Get trimmed dataset (sort by value and then truncate). The stop index is
# written out explicitly because the shorthand [k:-k] is [0:0] - an empty
# array - when k == 0, which would turn "trim nothing" into "keep nothing".
z_sorted = np.sort(z_with_outlier)
z_trimmed = z_sorted[number_to_trim:number_of_points - number_to_trim]
print(f' length of original dataset: {len(z_with_outlier)}')
print(f'  length of trimmed dataset: {len(z_trimmed)}\n')

# Get mean of trimmed dataset
mean_of_z_trimmed = np.mean(z_trimmed)

# Print the result
print(f'                  mu_z = {mu_z:10.4f}')
print(f'             mean_of_z = {mean_of_z:10.4f}')
print(f'mean_of_z_with_outlier = {mean_of_z_with_outlier:10.4f}')
print(f'     mean_of_z_trimmed = {mean_of_z_trimmed:10.4f}')

Plot a histogram.

In [None]:
# Histogram of the dataset with the outlier (same bins as before), plus
# vertical lines at three means: original data, data with the outlier,
# and data with the outlier after trimming
fig, ax = plt.subplots(figsize=(6, 3))
ax.hist(z_with_outlier, bins=np.linspace(mu_z - 10 * std_z, mu_z + 10 * std_z, 33))
ax.plot([mean_of_z] * 2, [0, n], linestyle='--', label='mean')
ax.plot(
    [mean_of_z_with_outlier] * 2,
    [0, n],
    linestyle='-.',
    linewidth=2,
    label='mean with outlier',
)
ax.plot(
    [mean_of_z_trimmed] * 2,
    [0, n],
    linestyle='-.',
    linewidth=2,
    label='mean with outlier (trimmed)',
)
ax.set_xlabel('z')
ax.set_ylabel('count')
ax.set_ylim(0, 0.4 * n)  # the mean lines are drawn up to n; show only the lower 40%
ax.legend()
ax.grid()
plt.show()

The "trimmed mean" is only one example of a *robust mean* (or, more generally, a robust estimator). There are many other approaches, e.g., M-estimators, Huber regressors, RANSAC, bootstrapping, etc.