### Download Necessary Files

In [1]:
from os.path import basename, exists


def download(url):
    filename = basename(url)
    if not exists(filename):
        from urllib.request import urlretrieve

        local, _ = urlretrieve(url, filename)
        print("Downloaded " + local)


download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkstats2.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkplot.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/hinc.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/hinc06.csv")


In [2]:
import numpy as np

import thinkstats2
import thinkplot

### Excercise 5.1

#### Percentage of the U.S. male population

In [3]:
import scipy.stats

# Normal Distribution of men
m_mu = 178
m_sigma = 7.7
dist = scipy.stats.norm(loc=m_mu, scale=m_sigma)

In [4]:
low = dist.cdf(177.8)  # 5'10" converted into 177.8 CM
high = dist.cdf(185.4)  # 6'1" converted into 185.4
high - low

np.float64(0.3420946829459531)

##### Findings: Around 34% of Male are between 178 CM to 185 CM

### Excercise 5.2

In [5]:
alpha = 1.7
xmin = 1  # meter
dist = scipy.stats.pareto(b=alpha, scale=xmin)
dist.median()

np.float64(1.5034066538560549)

What is the mean height in Pareto world?

In [6]:
dist.mean()

np.float64(2.428571428571429)

What fraction of people are shorter than the mean?

In [7]:
dist.cdf(dist.mean())

np.float64(0.778739697565288)

### Excercise 6.1

In [8]:
#Copyied from hinc2

def InterpolateSample(df, log_upper=6.0):
    """Makes a sample of log10 household income.

    Assumes that log10 income is uniform in each range.

    df: DataFrame with columns income and freq
    log_upper: log10 of the assumed upper bound for the highest range

    returns: NumPy array of log10 household income
    """
    # compute the log10 of the upper bound for each range
    df['log_upper'] = np.log10(df.income)

    # get the lower bounds by shifting the upper bound and filling in
    # the first element
    df['log_lower'] = df.log_upper.shift(1)
    df.loc[0, 'log_lower'] = 3.0

    # plug in a value for the unknown upper bound of the highest range
    df.loc[41, 'log_upper'] = log_upper
    
    # use the freq column to generate the right number of values in
    # each range
    arrays = []
    for _, row in df.iterrows():
        vals = np.linspace(row.log_lower, row.log_upper, int(row.freq))
        arrays.append(vals)

    # collect the arrays into a single sample
    log_sample = np.concatenate(arrays)
    return log_sample


In [9]:
import hinc
income_df = hinc.ReadData() #Read data from hinco6.csv
log_sample = InterpolateSample(income_df, log_upper=6.0)
sample = np.power(10, log_sample)

In [10]:
#Mean,Median

print("Mean of the Sample ",np.mean(sample))
print("Median of the Sample",np.median(sample))

Mean of the Sample  74278.7075311872
Median of the Sample 51226.93306562372


In [11]:
from scipy.stats import skew
from scipy.stats import pareto
print("Skew of the Sample ",skew (sample))
print("pareto Skew of the Sample ",pareto.stats (sample,moments='s'))

Skew of the Sample  4.949920244429585
pareto Skew of the Sample  [2.00601504 2.00601273 2.00601043 ... 2.00000601 2.000006   2.000006  ]


In [12]:
# Calculate the histogram
hist, bin_edges = np.histogram(sample, bins=10, density=True)

# Calculate the CDF
cdf = np.cumsum(hist) * np.diff(bin_edges)

print (cdf)

[0.78485685 0.95578892 0.97939702 0.98431299 0.98813471 0.99125414
 0.99389178 0.99617828 0.9981953  1.        ]
