In [4]:
pip install thinkx

Collecting thinkx
  Using cached thinkx-1.1.3.tar.gz (41 kB)
Collecting markdown
  Using cached Markdown-3.3.4-py3-none-any.whl (97 kB)
Building wheels for collected packages: thinkx
  Building wheel for thinkx (setup.py): started
  Building wheel for thinkx (setup.py): finished with status 'done'
  Created wheel for thinkx: filename=thinkx-1.1.3-py3-none-any.whl size=59946 sha256=7b3d77eecb09ede54e73a836e1c6cee3e1870345a825c8bc8dda30191b2eef31
  Stored in directory: c:\users\vasan\appdata\local\pip\cache\wheels\5e\fb\91\506f7f82cc754d3310b68a59ae3c08ca7fefe6d0e83ba2f570
Successfully built thinkx
Installing collected packages: markdown, thinkx
Successfully installed markdown-3.3.4 thinkx-1.1.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
import thinkstats2

In [2]:
pmf=thinkstats2.Pmf([1,2,3,4,5])

In [3]:
pmf

Pmf({1: 0.2, 2: 0.2, 3: 0.2, 4: 0.2, 5: 0.2})

In [4]:
pmf.Prob(2)

0.2

In [5]:
pmf.Incr(2,0.3)

In [6]:
pmf.Prob(2)

0.5

In [7]:
pmf.Total()

1.2999999999999998

In [8]:
pmf.Normalize()

1.2999999999999998

In [9]:
pmf.Total()

1.0000000000000002

In [12]:
"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function

import math
import numpy as np

import nsfg
import first
import thinkstats2
import thinkplot


def MakeHists(live):
    """Plot Hists for live births

    live: DataFrame
    others: DataFrame
    """
    hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg')
    thinkplot.PrePlot(2, cols=2)

    thinkplot.SubPlot(1)
    thinkplot.Hist(hist)
    thinkplot.Config(xlabel='years',
                     ylabel='frequency',
                     axis=[0, 45, 0, 700])

    thinkplot.SubPlot(2)
    thinkplot.Pmf(hist)

    thinkplot.Save(root='probability_agepreg_hist', 
                   xlabel='years',
                   axis=[0, 45, 0, 700])


def MakeFigures(firsts, others):
    """Plot Pmfs of pregnancy length.

    firsts: DataFrame
    others: DataFrame
    """
    # plot the PMFs
    first_pmf = thinkstats2.Pmf(firsts.prglngth, label='first')
    other_pmf = thinkstats2.Pmf(others.prglngth, label='other')
    width = 0.45

    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(first_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Config(xlabel='weeks',
                     ylabel='probability',
                     axis=[27, 46, 0, 0.6])

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([first_pmf, other_pmf])
    thinkplot.Save(root='probability_nsfg_pmf',
                   xlabel='weeks',
                   axis=[27, 46, 0, 0.6])

    # plot the differences in the PMFs
    weeks = range(35, 46)
    diffs = []
    for week in weeks:
        p1 = first_pmf.Prob(week)
        p2 = other_pmf.Prob(week)
        diff = 100 * (p1 - p2)
        diffs.append(diff)

    thinkplot.Bar(weeks, diffs)
    thinkplot.Save(root='probability_nsfg_diffs',
                   title='Difference in PMFs',
                   xlabel='weeks',
                   ylabel='percentage points',
                   legend=False)


def BiasPmf(pmf, label=''):
    """Returns the Pmf with oversampling proportional to value.

    If pmf is the distribution of true values, the result is the
    distribution that would be seen if values are oversampled in
    proportion to their values; for example, if you ask students
    how big their classes are, large classes are oversampled in
    proportion to their size.

    Args:
      pmf: Pmf object.
      label: string label for the new Pmf.

     Returns:
       Pmf object
    """
    new_pmf = pmf.Copy(label=label)

    for x, p in pmf.Items():
        new_pmf.Mult(x, x)
        
    new_pmf.Normalize()
    return new_pmf


def UnbiasPmf(pmf, label=''):
    """Returns the Pmf with oversampling proportional to 1/value.

    Args:
      pmf: Pmf object.
      label: string label for the new Pmf.

     Returns:
       Pmf object
    """
    new_pmf = pmf.Copy(label=label)

    for x, p in pmf.Items():
        new_pmf.Mult(x, 1.0/x)
        
    new_pmf.Normalize()
    return new_pmf


def ClassSizes():
    """Generate PMFs of observed and actual class size.
    """
    # start with the actual distribution of class sizes from the book
    d = { 7: 8, 12: 8, 17: 14, 22: 4, 
          27: 6, 32: 12, 37: 8, 42: 3, 47: 2 }

    # form the pmf
    pmf = thinkstats2.Pmf(d, label='actual')
    print('mean', pmf.Mean())
    print('var', pmf.Var())
    
    # compute the biased pmf
    biased_pmf = BiasPmf(pmf, label='observed')
    print('mean', biased_pmf.Mean())
    print('var', biased_pmf.Var())

    # unbias the biased pmf
    unbiased_pmf = UnbiasPmf(biased_pmf, label='unbiased')
    print('mean', unbiased_pmf.Mean())
    print('var', unbiased_pmf.Var())

    # plot the Pmfs
    thinkplot.PrePlot(2)
    thinkplot.Pmfs([pmf, biased_pmf])
    thinkplot.Save(root='class_size1',
                   xlabel='class size',
                   ylabel='PMF',
                   axis=[0, 52, 0, 0.27])
    
 
def main(script):
    live, firsts, others = first.MakeFrames()
    MakeFigures(firsts, others)
    MakeHists(live)

    ClassSizes()


if __name__ == '__main__':
    import sys
    main(*sys.argv)




In [14]:
first_pmf = thinkstats2.Pmf(firsts.prglngth, label='first')
other_pmf = thinkstats2.Pmf(others.prglngth, label='other')

NameError: name 'firsts' is not defined

In [15]:
d = { 7: 8, 12: 8, 17: 14, 22: 4,
27: 6, 32: 12, 37: 8, 42: 3, 47: 2 }

In [16]:
pmf=thinkstats2.Pmf(d,label='actual')
print(pmf.Mean())

23.692307692307693


In [17]:
import numpy as np
import pandas
array = np.random.randn(4, 2)
df = pandas.DataFrame(array)
df

Unnamed: 0,0,1
0,1.138006,-0.892723
1,-0.398057,-0.558606
2,0.681682,-1.339747
3,1.602662,-0.512628


In [18]:
columns = ['A', 'B']
df = pandas.DataFrame(array, columns=columns)
df

Unnamed: 0,A,B
0,1.138006,-0.892723
1,-0.398057,-0.558606
2,0.681682,-1.339747
3,1.602662,-0.512628


In [19]:
index = ['a', 'b', 'c', 'd']
df = pandas.DataFrame(array, columns=columns, index=index)
df

Unnamed: 0,A,B
a,1.138006,-0.892723
b,-0.398057,-0.558606
c,0.681682,-1.339747
d,1.602662,-0.512628


In [20]:
df['A']

a    1.138006
b   -0.398057
c    0.681682
d    1.602662
Name: A, dtype: float64

In [21]:
df.loc['a']

A    1.138006
B   -0.892723
Name: a, dtype: float64

In [22]:
df['a':'c']

Unnamed: 0,A,B
a,1.138006,-0.892723
b,-0.398057,-0.558606
c,0.681682,-1.339747


In [23]:
df[0:2]

Unnamed: 0,A,B
a,1.138006,-0.892723
b,-0.398057,-0.558606


In [25]:
"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function, division

import numpy as np

import nsfg
import first

import thinkstats2
import thinkplot


def PercentileRank(scores, your_score):
    """Computes the percentile rank relative to a sample of scores."""
    count = 0
    for score in scores:
        if score <= your_score:
            count += 1

    percentile_rank = 100.0 * count / len(scores)
    return percentile_rank

scores = [55, 66, 77, 88, 99]
your_score = 88

print('score, percentile rank')
for score in scores:
    print(score, PercentileRank(scores, score))
print()

def Percentile(scores, percentile_rank):
    """Computes the value that corresponds to a given percentile rank. """
    scores.sort()
    for score in scores:
        if PercentileRank(scores, score) >= percentile_rank:
            return score

def Percentile2(scores, percentile_rank):
    """Computes the value that corresponds to a given percentile rank.

    Slightly more efficient.
    """
    scores.sort()
    index = percentile_rank * (len(scores)-1) // 100
    return scores[index]

print('prank, score, score')
for percentile_rank in [0, 20, 25, 40, 50, 60, 75, 80, 100]:
    print(percentile_rank, 
          Percentile(scores, percentile_rank),
          Percentile2(scores, percentile_rank))


def EvalCdf(sample, x):
    """Computes CDF(x) in a sample.

    sample: sequence
    x: value

    returns: cumulative probability
    """
    count = 0.0
    for value in sample:
        if value <= x:
            count += 1.0

    prob = count / len(sample)
    return prob

sample = [1, 2, 2, 3, 5]

print('x', 'CDF(x)')
for x in range(0, 7):
    print(x, EvalCdf(sample, x))



def PositionToPercentile(position, field_size):
    """Converts from position in the field to percentile.

    position: int
    field_size: int
    """
    beat = field_size - position + 1
    percentile = 100.0 * beat / field_size
    return percentile


def PercentileToPosition(percentile, field_size):
    """Converts from percentile to hypothetical position in the field.

    percentile: 0-100
    field_size: int
    """
    beat = percentile * field_size / 100.0
    position = field_size - beat + 1
    return position


# my time 42:44
print('Percentile rank in field', PositionToPercentile(97, 1633))
print('Percentile rank in age group', PositionToPercentile(26, 256))

percentile = PositionToPercentile(26, 256)
print('Equivalent position in M50-59', PercentileToPosition(percentile, 171))
# 17th place = 46:05
print('Equivalent position in F20-29', PercentileToPosition(percentile, 448))
# 48:28


def MakeExample():
    """Makes a simple example CDF."""
    t = [2, 1, 3, 2, 5]
    cdf = thinkstats2.Cdf(t)
    thinkplot.Clf()
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='cumulative_example_cdf',
                   xlabel='x',
                   ylabel='CDF',
                   axis=[0, 6, 0, 1],
                   legend=False)    


def MakeFigures(live, firsts, others):
    """Creates several figures for the book.

    live: DataFrame
    firsts: DataFrame
    others: DataFrame
    """

    first_wgt = firsts.totalwgt_lb
    first_wgt_dropna = first_wgt.dropna()
    print('Firsts', len(first_wgt), len(first_wgt_dropna))
    #assert len(first_wgt_dropna) == 4381
 
    other_wgt = others.totalwgt_lb
    other_wgt_dropna = other_wgt.dropna()
    print('Others', len(other_wgt), len(other_wgt_dropna))
    #assert len(other_wgt_dropna) == 4706

    first_pmf = thinkstats2.Pmf(first_wgt_dropna, label='first')
    other_pmf = thinkstats2.Pmf(other_wgt_dropna, label='other')

    width = 0.4 / 16

    # plot PMFs of birth weights for first babies and others
    thinkplot.PrePlot(2)
    thinkplot.Hist(first_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Save(root='cumulative_birthwgt_pmf',
                   title='Birth weight',
                   xlabel='weight (pounds)',
                   ylabel='PMF')

    # plot CDFs of birth weights for first babies and others
    first_cdf = thinkstats2.Cdf(firsts.totalwgt_lb, label='first')
    other_cdf = thinkstats2.Cdf(others.totalwgt_lb, label='other')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([first_cdf, other_cdf])
    thinkplot.Save(root='cumulative_birthwgt_cdf',
                   title='Birth weight',
                   xlabel='weight (pounds)',
                   ylabel='CDF',
                   axis=[0, 12.5, 0, 1]
                   )


def MakeCdf(live):
    """Plot the CDF of pregnancy lengths for live births.
   
    live: DataFrame for live births
    """
    cdf = thinkstats2.Cdf(live.prglngth, label='prglngth')
    thinkplot.Cdf(cdf)
    thinkplot.Save('cumulative_prglngth_cdf',
                   title='Pregnancy length',
                   xlabel='weeks',
                   ylabel='CDF')


def RandomFigure(live):
    weights = live.totalwgt_lb
    cdf = thinkstats2.Cdf(weights, label='totalwgt_lb')

    sample = np.random.choice(weights, 100, replace=True)
    ranks = [cdf.PercentileRank(x) for x in sample]

    rank_cdf = thinkstats2.Cdf(ranks, label='percentile ranks')
    thinkplot.Cdf(rank_cdf)
    thinkplot.Save(root='cumulative_random',
                   xlabel='percentile rank',
                   ylabel='CDF')


def TestSample(live):
    """Plots the distribution of weights against a random sample.

    live: DataFrame for live births
    """
    weights = live.totalwgt_lb
    cdf = thinkstats2.Cdf(weights, label='totalwgt_lb')

    sample = cdf.Sample(1000)
    sample_cdf = thinkstats2.Cdf(sample, label='sample')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([cdf, sample_cdf])
    thinkplot.Save(root='cumulative_sample',
                   xlabel='weight (pounds)',
                   ylabel='CDF')


def main(name, data_dir=''):
    thinkstats2.RandomSeed(17)

    MakeExample()
    live, firsts, others = first.MakeFrames()
    RandomFigure(live)
    TestSample(live)
    MakeCdf(live)
    MakeFigures(live, firsts, others)


score, percentile rank
55 20.0
66 40.0
77 60.0
88 80.0
99 100.0

prank, score, score
0 55 55
20 55 55
25 66 66
40 66 66
50 77 77
60 77 77
75 88 88
80 88 88
100 99 99
x CDF(x)
0 0.0
1 0.2
2 0.6
3 0.8
4 0.8
5 1.0
6 1.0
Percentile rank in field 94.12124923453766
Percentile rank in age group 90.234375
Equivalent position in M50-59 17.69921875
Equivalent position in F20-29 44.75
