In [1]:
!pip install thinkx

Collecting thinkx
[?25l  Downloading https://files.pythonhosted.org/packages/34/7a/43e0e92fbd96916a11a78ce637b34e9e9416fa48f9d3ef90927234e86c36/thinkx-1.1.3.tar.gz (41kB)
[K    100% |████████████████████████████████| 51kB 5.6MB/s ta 0:00:01
Collecting markdown (from thinkx)
[?25l  Downloading https://files.pythonhosted.org/packages/c0/4e/fd492e91abdc2d2fcb70ef453064d980688762079397f779758e055f6575/Markdown-3.1.1-py2.py3-none-any.whl (87kB)
[K    100% |████████████████████████████████| 92kB 12.5MB/s ta 0:00:01
Building wheels for collected packages: thinkx
  Running setup.py bdist_wheel for thinkx ... [?25ldone
[?25h  Stored in directory: /home/beakerx/.cache/pip/wheels/74/e9/57/f4ad676ba30144b574a6cc36606e82874f8aac8271112f1cd3
Successfully built thinkx
Installing collected packages: markdown, thinkx
Successfully installed markdown-3.1.1 thinkx-1.1.3


In [2]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import re
import thinkstats2

In [3]:
# Directory, relative to the notebook's working directory, that receives the
# downloaded 2002FemPreg files and the CSV written by the last cell.
dataset_dir = '../datasets/pregnancies'

# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of testing os.path.exists() first.
os.makedirs(dataset_dir, exist_ok=True)

In [4]:
%%bash
# Stop at the first failing command: without this a failed `cd` would let
# `rm`/`wget` run in the wrong directory, and a failed first download would be
# hidden behind the exit status of the second one.
set -euo pipefail
cd ../datasets/pregnancies
# Remove earlier copies so the fresh downloads are saved under these exact names.
rm -f 2002FemPreg.dat.gz 2002FemPreg.dct
wget https://raw.githubusercontent.com/AllenDowney/ThinkStats2/master/code/2002FemPreg.dat.gz
wget https://raw.githubusercontent.com/AllenDowney/ThinkStats2/master/code/2002FemPreg.dct

--2019-08-10 00:59:30--  https://raw.githubusercontent.com/AllenDowney/ThinkStats2/master/code/2002FemPreg.dat.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.164.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.164.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1041415 (1017K) [application/octet-stream]
Saving to: ‘2002FemPreg.dat.gz’

     0K .......... .......... .......... .......... ..........  4% 6.58M 0s
    50K .......... .......... .......... .......... ..........  9% 10.4M 0s
   100K .......... .......... .......... .......... .......... 14% 23.2M 0s
   150K .......... .......... .......... .......... .......... 19% 15.2M 0s
   200K .......... .......... .......... .......... .......... 24% 12.6M 0s
   250K .......... .......... .......... .......... .......... 29% 9.57M 0s
   300K .......... .......... .......... .......... .......... 34% 12.4M 0s
   350K .......... .......... .......

In [5]:
# Locations of the two files fetched by the download cell.
dct_path = os.path.join(dataset_dir, '2002FemPreg.dct')
dat_path = os.path.join(dataset_dir, '2002FemPreg.dat.gz')

# Read the .dct dictionary first, then use it to read the gzip-compressed
# fixed-width data file into a DataFrame.
dct = thinkstats2.ReadStataDct(dct_path, encoding='iso-8859-1')
df = dct.ReadFixedWidth(dat_path, compression='gzip')

In [6]:
# Source: https://github.com/AllenDowney/ThinkStats2/blob/master/code/nsfg2.py#L47
def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame, in place, and returns it.

    The frame passed in is modified (and also returned so the result can be
    re-assigned):

    * ``agepreg`` is divided by 100 (centiyears -> years).
    * ``birthwgt_lb`` values above 20, and the codes 97/98/99 in
      ``birthwgt_lb`` and ``birthwgt_oz``, become NaN.
    * ``totalwgt_lb`` is added as ``birthwgt_lb + birthwgt_oz / 16``.
    * ``phase`` is set to NaN when that column exists.

    ``agepreg`` is rescaled on every call, so calling this twice on the same
    frame divides the age by 100 twice.

    df: DataFrame with ``agepreg``, ``birthwgt_lb`` and ``birthwgt_oz`` columns

    returns: the same DataFrame object that was passed in
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs); blank out
    # everything above 20 lb.  mask() builds a new Series, so this also works
    # when the column is integer-typed and must be upcast to hold NaN.
    pounds = df['birthwgt_lb']
    pounds = pounds.mask(pounds > 20)

    # replace 'not ascertained', 'refused', 'don't know' with NaN.
    # The result is assigned back rather than using replace(..., inplace=True)
    # on the column: the in-place form acts on a temporary Series and leaves
    # the frame unchanged when pandas copy-on-write is active.
    na_vals = [97, 98, 99]
    df['birthwgt_lb'] = pounds.replace(na_vals, np.nan)
    df['birthwgt_oz'] = df['birthwgt_oz'].replace(na_vals, np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN.  Item assignment behind a column check is used
    # because attribute assignment would only set a plain Python attribute
    # (not a column) if `phase` were missing.
    # NOTE(review): this cleaner was copied from nsfg2.py but the notebook
    # loads the 2002 dictionary -- confirm `phase` is the clipped column here.
    if 'phase' in df.columns:
        df['phase'] = np.nan
    return df

# Recode the loaded frame.  CleanFemPreg modifies the frame it is given, so
# re-running this cell without reloading the data rescales `agepreg` again.
df = CleanFemPreg(df)

In [7]:
# Keep only the rows whose `outcome` code is 1, 3 or 4.
# NOTE(review): presumably 1 = live birth and 3/4 = pregnancy losses in the
# survey codebook -- confirm.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise SettingWithCopyWarning.
df = df.loc[df['outcome'].isin([1, 3, 4])].copy()

# Turn the code into a boolean label: True where the code was 1.
df['outcome'] = df['outcome'].eq(1)

# Move the label to the last position, keeping the other columns in order.
df = df[[c for c in df.columns if c != 'outcome'] + ['outcome']]

In [8]:
# Balance the two classes by down-sampling every `outcome` group to the size
# of the smallest group.
g = df.groupby('outcome')
n_per_class = g.size().min()  # computed once, not once per group

# Sample each group with the same fixed seed and stack the pieces in group-key
# order.  Iterating over the groups (instead of g.apply) selects the same rows
# but keeps the `outcome` column no matter how groupby.apply treats grouping
# columns in the installed pandas version.
df = pd.concat(
    [rows.sample(n_per_class, random_state=1) for _, rows in g]
).reset_index(drop=True)

In [9]:
# Order the rows by `cmprgend`, ascending; this has to happen here because the
# column is dropped by the column selection in the next cell.
df = df.sort_values(by='cmprgend', ascending=True)

In [10]:
# Keep only these six columns, with the `outcome` label last; everything else
# is dropped from the frame.
kept_columns = ['parity', 'agecon', 'pregordr', 'poverty', 'educat', 'outcome']
df = df[kept_columns]

In [11]:
# Reduce parity by one, flooring the result at zero.  Vectorised sub/clip
# replaces the per-row Python lambda, and assign() builds a new frame instead
# of writing into the column subset taken in the previous cell (which can
# raise SettingWithCopyWarning).
df = df.assign(parity=df['parity'].sub(1).clip(lower=0))

In [12]:
# Write the final frame next to the downloaded files, without the row index.
output_csv = os.path.join(dataset_dir, 'pregnancies.csv')
df.to_csv(output_csv, index=False)