# 5. Processing Unidimensional Data

## 5.1 Aggregating Numeric Data

### 5.1.1 Measures of Location

#### 5.1.1.1 Arithmetic Mean and Median

In [None]:
import numpy as np

In [None]:
heights = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
                     "teaching-data/master/marek/nhanes_adult_female_height_2020.txt")

In [None]:
np.mean(heights), np.median(heights)

In [None]:
income = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
                    "teaching-data/master/marek/uk_income_simulated_2020.txt")

In [None]:
np.mean(income), np.median(income)

In [None]:
marathon = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
                      "teaching-data/master/marek/37_pzu_warsaw_marathon_mins.txt")

In [None]:
np.mean(marathon), np.median(marathon)

##### Exercise 5.2

In [None]:
x = np.sort(marathon)

In [None]:
type(x)

In [None]:
length = len(marathon)

In [None]:
type(length)

In [None]:
x[2]

In [None]:
med_location = length//2
med_location

In [None]:
type(med_location)

In [None]:
test = x[med_location]
test

In [None]:
type(test)

In [None]:
def amedian(data):
    """
    Find the median adam's way.

    Sorts a copy of ``data`` and returns the middle observation when the
    sample size is odd, or the arithmetic mean of the two middle
    observations when it is even, so the result agrees with ``np.median``
    for one-dimensional input.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of numbers.

    Returns
    -------
    The sample median of ``data``.

    Raises
    ------
    IndexError
        If ``data`` is empty (there is no middle element to pick).
    """
    x = np.sort(data)
    length = len(data)
    med_location = length//2
    if length % 2 == 1:
        # Odd sample size: a single middle element exists.
        return x[med_location]
    # Even sample size: no single middle element, so average the two
    # values on either side of the centre.
    return (x[med_location - 1] + x[med_location]) / 2

In [None]:
am = amedian(marathon)

In [None]:
print(am)

In [None]:
print(amedian(marathon))

In [None]:
amedian(marathon) == np.median(marathon)

In [None]:
244 == 244

In [None]:
m = np.median(marathon)

In [None]:
type(m)

In [None]:
print(am)

In [None]:
type(am)

In [None]:
print(am)

In [None]:
amedian(marathon)

If a function finishes without executing a `return` statement, it returns `None` (an object of type `NoneType`).

In [None]:
# Pass the function object itself: a string is looked up by pydoc as an
# importable dotted name, and "np" is only a local alias, not a module name.
help(np.mean)

In [None]:
income.mean()

In [None]:
income.shape[0]

In [None]:
income.sum()

In [None]:
income.size

#### 5.1.1.2 Sensitive to Outliers vs Robust

In [None]:
income2 = np.append(income, [1000000000])
print(np.mean(income), np.mean(income2))

In [None]:
print(np.median(income), np.median(income2))

#### 5.1.1.3 Sample Quantiles

In [None]:
np.quantile(heights, [0, 0.25, 0.5, 0.75, 1])

In [None]:
np.quantile(income, [0, 0.25, 0.5, 0.75, 1])

In [None]:
np.quantile(income, [0.025, 0.975])

### 5.1.2 Measures of Dispersion

In [None]:
np.std(heights), np.std(income)

In [None]:
np.quantile(heights, 0.75) - np.quantile(heights, 0.25)

In [None]:
np.quantile(income, 0.75) - np.quantile(income, 0.25)

### 5.1.3 Measures of Shape

In [None]:
import scipy.stats

In [None]:
scipy.stats.skew(heights)

In [None]:
scipy.stats.skew(income)

In [None]:
scipy.stats.kurtosis(heights)

In [None]:
scipy.stats.kurtosis(income)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# One figure with four stacked panels (4 rows, 1 column):
# box plots of heights and income on top, violin plots of the same data below.
plt.subplot(4, 1, 1)  # 4 rows, 1 column, 1st subplot
sns.boxplot(data=heights, orient="h", color="lightgray")
plt.plot(np.mean(heights), 0, "wX")  # mark the arithmetic mean with a white "X"
plt.yticks([0], ["heights"])
plt.subplot(4, 1, 2)  # 4 rows, 1 column, 2nd subplot
sns.boxplot(data=income, orient="h", color="lightgray")
plt.plot(np.mean(income), 0, "wX")  # mark the arithmetic mean with a white "X"
plt.yticks([0], ["income"])
plt.subplot(4, 1, 3)  # 4 rows, 1 column, 3rd subplot
sns.violinplot(data=heights, orient="h", color="lightgray")
plt.yticks([0], ["heights"])
plt.subplot(4, 1, 4)  # 4 rows, 1 column, 4th subplot
sns.violinplot(data=income, orient="h", color="lightgray")
plt.yticks([0], ["income"])
plt.show()

## 5.2 Vectorised Mathematical Functions

In [None]:
np.round([-3.249, -3.151, 2.49, 2.51, 3.49, 3.51], 1)

In [None]:
plt.subplot(1, 2, 1)
x = np.linspace(np.exp(-2), np.exp(3), 1001)
plt.plot(x, np.log(x), label="$y=\\log x$")
plt.legend()
plt.subplot(1, 2, 2)
x = np.linspace(-2, 3, 1001)
plt.plot(x, np.exp(x), label="$y=\\exp(x)$")
plt.legend()
plt.show()

In [None]:
10.0**np.array([-1, 0, 1, 2]) 

In [None]:
np.log10([-1, 0.01, 0.1, 1, 2, 5, 10, 100, 1000, 10000])

In [None]:
x = np.linspace(-2*np.pi, 4*np.pi, 1001)
plt.plot(x, np.cos(x))
plt.xticks(
    [-2*np.pi, -np.pi, 0, np.pi/2, np.pi, 3*np.pi/2, 2*np.pi, 4*np.pi],
    ["$-2\\pi$", "$-\\pi$", "$0$", "$\\pi/2$", "$\\pi$",
     "$3\\pi/2$", "$2\\pi$", "$4\\pi$"]
)
plt.show()

In [None]:
print(x)

In [None]:
len(x)

In [None]:
6.28318531 / 3.1415

In [None]:
6.28318531 / np.pi

Identities worth memorising:

$$ \sin(x) = \cos(\frac{\pi}{2} - x) $$
$$ \cos(-x) = \cos(x) $$
$ \cos^2(x) + \sin^2(x) = 1 $, where $ \cos^2(x) = (\cos(x))^2 $

$$ \cos(x + y) = \cos(x)\cos(y) - \sin(x)\sin(y)$$
$$ \cos(x - y) = \cos(x)\cos(y) + \sin(x)\sin(y)$$

In [None]:
np.array([-2, -1, 0, 1, 2, 3])**2

In [None]:
(np.array([-2, -1, 0, 1, 2, 3])+2)/5

In [None]:
np.array([-2, -1, 0, 1, 2, 3])+2/5

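PEMDAS (order of operations):
1. parentheses
2. exponents
3. multiplication and division (equal precedence, evaluated left to right)
4. addition and subtraction (equal precedence, evaluated left to right)

Hence `a + 2/5` adds 0.4 to `a`, whereas `(a + 2)/5` divides the whole sum by 5.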

#### 5.3.2.1 Standardisation and Z-score

In [None]:
heights[-5:]

In [None]:
np.mean(heights), np.std(heights)

In [None]:
heights_std = (heights - np.mean(heights)) / np.std(heights)
heights_std[-5:]

$$ z_{i} = \frac{x_{i} - \bar{x}}{s} $$

In [None]:
np.mean(heights_std), np.std(heights_std)

#### 5.3.2.2 Min-Max Scaling and Clipping

In [None]:
x = np.array([-1.5, 0.5, 3.5, -1.33, 0.25, 0.8])
(x - np.min(x))/(np.max(x) - np.min(x))

$$ \frac{x_{i} - x_{min}}{x_{max} - x_{min}} $$

In [None]:
np.clip(x, 0, 1)

In [None]:
np.max(x)

In [None]:
print(x)

In [None]:
np.maximum(0, x)

In [None]:
np.minimum(1, x)

In [None]:
np.minimum(1, np.maximum(0, x))

In [None]:
help(np.minimum)

#### 5.3.2.3 Normalisation (l_2; Dividing by Magnitude)

$$ ||(x_{1},\ldots,x_{n})|| = \sqrt{x_{1}^{2} + x_{2}^{2} + \ldots + x_{n}^{2}} = \sqrt{\sum_{i=1}^{n} x_{i}^{2}} $$

In [None]:
x = np.array([1, 5, -4, 2, 2.5])
x/np.sqrt(np.sum(x**2))

In [None]:
len(x)

In [None]:
x / np.sqrt(len(x))

#### 5.3.2.4 Normalisation (l_1; Dividing by Sum)

In [None]:
x /np.sum(np.abs(x))

In [None]:
c, b = np.histogram(heights, [-np.inf, 150, 160, 170, np.inf])
print(c) 

In [None]:
p = c/np.sum(c)
print(p)

### 5.3.3 Vector-Vector Case

In [None]:
np.array([2, 3, 4, 5]) * np.array([10, 100, 1000, 10000])

In [None]:
 p = np.array([0.1, 0.3, 0.25, 0.15, 0.12, 0.08]) 
-np.sum(p*np.log(p))

In [None]:
x = np.linspace(-np.pi, 1.5*np.pi, 1001)  # many points in the said interval
yf = np.sin(x)
yg = x - x**3/6 + x**5/120 - x**7/5040
plt.plot(x, yf, 'k-', label="f(x)")  # black solid line
plt.plot(x, yg, 'r:', label="g(x)")  # red dotted line
plt.legend()
plt.show()

In [None]:
weights = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
                     "teaching-data/master/marek/nhanes_adult_female_weight_2020.txt")

In [None]:
# Body mass index = weight / height**2, with height expressed in metres.
# NOTE(review): heights appear to be in centimetres (cf. the 150/160/170 bins
# used with them elsewhere in this notebook) and weights in kilograms; confirm
# against the dataset description.
fbmi = weights / (heights/100)**2
print(fbmi)

In [None]:
# Vectorised minimum; unlike the builtin min(), it propagates NaN consistently.
np.min(fbmi)

In [None]:
# Vectorised maximum; unlike the builtin max(), it propagates NaN consistently.
np.max(fbmi)

In [None]:
sns.boxplot(data=fbmi, orient="h", color="lightgray")
plt.plot(np.mean(fbmi), 0, "wX")
plt.show()

In [None]:
x = [10, 20, 30, 40, 50]
x[1:2]

In [None]:
x[0]

In [None]:
type(x[1:2])

### 5.4.1 Integer Indexing

In [None]:
x = np.array([10, 20, 30, 40, 50])
x[-1] # last


In [None]:
x[ [3] ]

In [None]:
x[ [3, 2, 1, 0, 4] ]

In [None]:
x[ [] ]

### 5.4.2 Logical Indexing

In [None]:
x[ [True, False, True, True, False] ]

In [None]:
x >= 30

In [None]:
x[ x >= 30 ]

In [None]:
y = (x/10) % 2

In [None]:
y

In [None]:
x [y == 0]

In [None]:
x[ (20 <= x) & (x <= 34) ]

In [None]:
g = (heights >= 150) & (heights <= 170)

In [None]:
gb = fbmi[g]

In [None]:
len(gb)

In [None]:
len(heights)

### 5.4.3 Slicing

In [None]:
x[::-1]

In [None]:
x

In [None]:
x[3:]

In [None]:
x[1:4]

In [None]:
y = np.array([6, 4, 8, 5, 1, 3, 2, 9, 7])

In [None]:
y[::2] *= 10

In [None]:
y

In [None]:
y[ [1, 3, 5, 7] ] *= 10

In [None]:
y

In [None]:
y[ [1, 3, 5, 7] ] //= 10

In [None]:
y

## 5.5 Other Operations

### 5.5.1 Cumulative Sums and Iterated Differences

In [None]:
np.cumsum([5, 3, -4, 1, 1, 3])

In [None]:
np.diff([5, 8, 4,5, 6, 9])

### 5.5.2 Sorting

In [None]:
x = np.array([40, 10, 20, 40, 40, 30, 20, 40, 50, 10, 10, 70, 30, 40, 30])

In [None]:
np.sort(x)

In [None]:
np.random.permutation(x)

### 5.5.3 Dealing with Tied Observations

In [None]:
x = np.array([40, 10, 20, 40, 40, 30, 20, 40, 50, 10, 10, 70, 30, 40, 30])

In [None]:
np.unique(x)

In [None]:
x

In [None]:
np.unique(x, return_counts=True)

In [None]:
np.all(np.unique(x, return_counts=True)[1] == 1)

In [None]:
help(np.unique)

In [None]:
np.unique(x, return_index=True)

### 5.5.4 Determining the Ordering Permutation and Ranking

In [None]:
x = np.array([40, 10, 20, 40, 40, 30, 20, 40, 50, 10, 10, 70, 30, 40, 30])

In [None]:
np.argsort(x)

In [None]:
x[np.argsort(x)]

In [None]:
x[1]

In [None]:
x[9]

In [None]:
x[10]

In [None]:
x = np.array([10, 40, 50, 20, 30, 10])

In [None]:
x

In [None]:
scipy.stats.rankdata(x)

In [None]:
scipy.stats.rankdata(x, method = "max")

In [None]:
scipy.stats.rankdata(x, method = "ordinal")

In [None]:
help(scipy.stats.rankdata)

### 5.5.5 Searching for Certain Indexes (Argmin, Argmax)