In [None]:
# Notebook-wide setup: imports, text-output width, plot style and RNG seed.
#
# np and plt are used further down in this very cell, so they are imported
# here; relying on imports made in later cells makes a fresh-kernel
# "Restart & Run All" stop at this cell with a NameError.
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# further settings - optional:
pd.set_option("display.notebook_repr_html", False)  # disable "rich" output

os.environ["COLUMNS"] = "74"  # output width, in characters
np.set_printoptions(linewidth=74)
pd.set_option("display.width", 74)


# Overall plot style. Matplotlib 3.6 renamed its bundled "seaborn" style
# sheet to "seaborn-v0_8" and later releases dropped the old name, so use
# whichever of the two this installation provides.
_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(_style)

_colours = [  # the "R4" palette
    "#000000", "#DF536B", "#61D04F", "#2297E6",
    "#28E2E5", "#CD0BBC", "#F5C710", "#999999"
]

_linestyles = [
    "solid", "dashed", "dashdot", "dotted"
]

plt.rcParams["axes.prop_cycle"] = plt.cycler(
    # each plotted line will have a different plotting style;
    # the 4 line styles are repeated twice to pair up with the 8 colours
    color=_colours, linestyle=_linestyles*2
)
plt.rcParams["patch.facecolor"] = _colours[0]

np.random.seed(123)  # initialise the pseudorandom number generator

In [None]:
import numpy as np

# Read the heights sample straight from the teaching-data repository
# (plain-text file parsed by np.loadtxt into a NumPy array).
heights = np.loadtxt(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/marek/nhanes_adult_female_height_2020.txt"
)

In [None]:
heights[:6]  # preview: the first six elements only

In [None]:
# Build a vector from a Python list and display it.
x = np.array([10, 20, 30, 40, 50, 60])
x

In [None]:
# Same vector as before, this time built from a tuple instead of a list.
x = np.array((10, 20, 30, 40, 50, 60))
x

In [None]:
len(x)

In [None]:
x.shape

In [None]:
x.dtype

In [None]:
x.astype(float)  # returns a converted copy; x itself keeps its dtype

In [None]:
np.arange(0, 10, 2)  # from 0, step 2; the stop value 10 is excluded

In [None]:
np.linspace(0, 1, 5)  # 5 equally spaced points; both 0 and 1 are included

In [None]:
help(np.linspace)

In [None]:
np.repeat(5, 6)

In [None]:
np.repeat([1, 2], 3)  # each element three times in a row: 1, 1, 1, 2, 2, 2

In [None]:
# Per-element repeat counts: 1 is repeated three times, 2 five times.
# Display the stored result rather than evaluating the same call twice.
x = np.repeat([1, 2], [3, 5])
x

In [None]:
type(x)

In [None]:
np.tile([1, 2], 3)  # the whole sequence three times: 1, 2, 1, 2, 1, 2

In [None]:
np.zeros(4)

In [None]:
np.ones(4)

In [None]:
np.repeat(1, 4).astype(float)  # same values as np.ones(4), built the long way

https://datawranglingpy.gagolewski.com/chapter/210-vector.html#numpy-r

**4.1.4. `numpy.r_` (\*)**

In [None]:
import numpy as np

In [None]:
# np.r_ glues the bracketed items into one vector; nan and inf are floats,
# so the whole result is a float array.
np.r_[1, 2, 3, np.nan, 5, np.inf]

In [None]:
np.r_[0:10:2]  # slice with a real step: same as np.arange(0, 10, 2)

In [None]:
np.r_[0:1:5j]  # imaginary "step" = number of points: same as np.linspace(0, 1, 5)

In [None]:
# Scalars, a list, an arange-like slice and a linspace-like slice combined:
# 1, 2, 3, 3, 0, 1, 2, 0, 1.5, 3
np.r_[1, 2, [3]*2, 0:3, 0:3:3j]

In [None]:
np.random.rand(5)  # five pseudorandom numbers drawn uniformly from [0, 1)

In [None]:
# 20 draws from 1, 2, ..., 9; sampling is with replacement by default,
# so values repeat.
np.random.choice(np.arange(1, 10), 20)

In [None]:
# Download the largest-cities population file from the teaching-data
# repository and parse it into a NumPy array.
plcu = np.loadtxt(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/marek/population_largest_cities_unnamed.txt"
)

In [None]:
type(plcu)

In [None]:
plcu[0:10]

In [None]:
len(plcu)

In [None]:
plcu.shape

**4.2. Mathematical Notation**

In [None]:
x = np.array([5, 4, 2, 1, 3])  # just an example
x_sorted = np.sort(x)  # a sorted copy; x itself keeps its original order
x_sorted[0], x_sorted[-1]  # the minimum and the maximum

$x = (x_1, x_2, \dots, x_i, \dots, x_n)$

$x_i$ is `x[i-1]` because the first element is at index 0

Let $x \in \mathbb{R}^n$.

The $\in$ symbol stands for "is in" or "is a member of"

$\mathbb{R}$ denotes the set of real numbers.

$\mathbb{R}^n$ is the set of real-valued sequences of length $n$.

**4.3. Inspecting the Data Distribution with Histograms**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Histogram of the heights sample split into 11 bins.
sns.histplot(heights, bins = 11, color = "lightgray")
plt.show()

In [None]:
# Load the simulated income sample and draw its histogram with 20 bins.
income = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
    "teaching-data/master/marek/uk_income_simulated_2020.txt")
# stat="percent": bar heights are percentages of all observations, not counts
sns.histplot(income, stat="percent", bins=20, color="lightgray")
plt.show()

In [None]:
# Fetch the weights sample from the teaching-data repository and show its
# histogram using 25 bins.
nhanes = np.loadtxt(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/marek/nhanes_adult_female_weight_2020.txt"
)
sns.histplot(nhanes, bins=25, color="lightgray")
plt.show()

In [None]:
len(income)

In [None]:
# Same data drawn twice, side by side, to compare a very coarse binning
# (5 bins) with a very fine one (200 bins).
plt.subplot(1, 2, 1)  # 1 row, 2 columns, 1st plot
sns.histplot(income, bins=5, color="lightgray")
plt.subplot(1, 2, 2)  # 1 row, 2 columns, 2nd plot
sns.histplot(income, bins=200, color="lightgray")
plt.ylabel(None)  # intended to drop the y-axis label on the right-hand panel
plt.show()

In [None]:
# np.histogram does the binning without drawing anything:
# counts holds the 20 per-bin counts, bins the 21 bin edges.
counts, bins = np.histogram(income, 20)
counts

In [None]:
bins

In [None]:
# Overwrite four of the computed edges by hand (in place, so `bins` no
# longer matches `counts` from the np.histogram call).
# NOTE(review): histogram bin edges must stay increasing - confirm each new
# value lies between its neighbouring edges for this data set.
bins[1] = 15000
bins[3] = 40000
bins[4] = 50000
bins[5] = 62000

In [None]:
# An array passed as `bins` is used as the explicit bin edges, so the bars
# follow whatever edges `bins` currently holds.
sns.histplot(income, stat="percent", bins=bins, color="lightgray")
plt.show()

In [None]:
help(sns.histplot)

In [None]:
help(np.histogram_bin_edges)

In [None]:
# Let NumPy pick the bin edges with Sturges' rule; pass bins="fd" instead to
# try the Freedman-Diaconis rule.
bins = np.histogram_bin_edges(income, bins="sturges")

In [None]:
sns.histplot(income, stat="percent", bins=bins, color="lightgray")
plt.show()

**4.3.4. peds: A Bimodal Distribution (Already Binned)**

In [None]:
# Load the pedestrian data and display the whole array.
peds = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
    "teaching-data/master/marek/southern_cross_station_peds_2019_dec.txt")
peds

In [None]:
len(peds)

In [None]:
# The values are drawn directly as bar heights (no histplot/binning step):
# one unit-width bar at each of x = 0, 1, ..., 23.
# NOTE(review): np.arange(0, 24) hard-codes the number of bars - it has to
# match len(peds).
plt.bar(np.arange(0, 24), width=1, height=peds,
    color="lightgray", edgecolor="black", alpha=0.8)
plt.show()

In [None]:
# Load the matura data and draw its values directly as bar heights:
# one unit-width bar at each of x = 0, 1, ..., 70.
# NOTE(review): np.arange(0, 71) hard-codes the number of bars - it has to
# match len(matura).
matura = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
    "teaching-data/master/marek/matura_2019_polish.txt")
plt.bar(np.arange(0, 71), width=1, height=matura,
    color="lightgray", edgecolor="black", alpha=0.8)
plt.show()

In [None]:
marathon = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
    "teaching-data/master/marek/37_pzu_warsaw_marathon_mins.txt")

In [None]:
len(marathon)

In [None]:
marathon.shape

In [None]:
marathon[:5]

In [None]:
type(marathon)

In [None]:
# Boolean mask: plot only the values below 180
# (presumably finish times in minutes, going by the file name - confirm).
sns.histplot(marathon[marathon < 180], color="lightgray")
plt.show()

In [None]:
sns.histplot(marathon, color="lightgray")
plt.show()

In [None]:
cities = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
    "teaching-data/master/other/us_cities_2000.txt")

In [None]:
large_cities = cities[cities >= 10000]  # keep only the values of at least 10000

In [None]:
large_cities[-5:]

In [None]:
sns.histplot(large_cities, bins=20, color="lightgray")
plt.show()

In [None]:
# log_scale=True puts the x-axis on a logarithmic scale and computes the
# 20 bins in log space, so they look equally wide on that axis.
sns.histplot(large_cities, bins=20, log_scale=True, color="lightgray")
plt.show()

In [None]:
sns.histplot(income, log_scale = True)
plt.show()

In [None]:
count, bins = np.histogram(income)  # no bin count given: defaults to 10 equal-width bins

In [None]:
income[:5]

In [None]:
# Four bin edges spaced evenly on a logarithmic scale, running from the
# smallest to the largest observed income. The endpoints must be scalars:
# passing the whole `income` array as `start` produces a 2-D array (one
# column per observation), which cannot serve as histogram bin edges.
# NOTE(review): geomspace needs non-zero endpoints of the same sign - this
# assumes all incomes are positive; confirm for this data set.
bins = np.geomspace(np.min(income), np.max(income), num=4)

In [None]:
sns.histplot(income, bins=bins, color="lightgray")
plt.show()

**4.3.8. Cumulative Counts**

In [None]:
# cumulative=True: each bar shows the percentage of observations falling in
# that bin or any bin to its left, so the last bar reaches 100%.
sns.histplot(heights, stat="percent", cumulative=True, color="lightgray")
plt.show()

In [None]:
# Empirical CDF drawn by hand: after sorting, the k-th smallest value
# (k = 1, ..., n) is plotted against k/n, the fraction of observations that
# do not exceed it.
n = len(heights)
heights_sorted = np.sort(heights)
# steps-post keeps the curve flat to the right of each data point until the
# next one, giving the usual staircase shape
plt.plot(heights_sorted, np.arange(1, n+1)/n, drawstyle="steps-post")
plt.xlabel("$t$")  # LaTeX maths
plt.ylabel("$\\hat{F}_n(t)$, i.e., Prob(height $\\leq$ t)")
plt.show()

In [None]:
help(sns.ecdfplot)

In [None]:
help(np.linspace)

In [None]:
np.linspace(23, 44, num = 23)  # exactly 23 points; both 23 and 44 are included

In [None]:
help(np.arange)

In [None]:
# Fixed step of 0.9 with the stop value excluded; the number of points is
# implied by the step rather than stated, unlike with np.linspace.
np.arange(23, 44, 0.9)

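How can I use the last output as the input to the next function? (In IPython/Jupyter, `_` refers to the most recent cell output, e.g. `len(_)`; assigning the result to a variable and passing that variable on is more reproducible.)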

In [None]:
# `.len()` on its own is a syntax error: Python has no implicit "previous
# result" receiver, and the length of a sequence is obtained with the
# built-in len(obj), not a method. Store the result under a name and pass
# it on explicitly (interactively, `_` also holds the last cell output, but
# that does not survive a Restart & Run All).
steps = np.arange(23, 44, 0.9)
len(steps)

In [None]:
help(next)