In [15]:
import pandas as pd
import numpy as np
import scipy
import matplotlib
import urllib.request as ur
import math
import re
import zlib
from collections import defaultdict
from math import isnan
%matplotlib notebook
import matplotlib.pyplot as plt
import pyFunctions as pF



femPregDCT = ur.urlopen("https://raw.githubusercontent.com/AllenDowney/ThinkStats2/master/code/2002FemPreg.dct")

femRespDCT = ur.urlopen("https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemResp.dct")

pregDataZip = "https://github.com/AllenDowney/ThinkStats2/blob/master/code/2002FemPreg.dat.gz?raw=true"

respDataZip = "https://github.com/AllenDowney/ThinkStats2/blob/master/code/2002FemResp.dat.gz?raw=true"


In [4]:
fPDCT = []

fRDCT = []

for i in femPregDCT:
	fPDCT.append(i.decode("utf-8") )
	
for i in femRespDCT:
	fRDCT.append(i.decode("utf-8") )

In [5]:
class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""
    def __init__(self, variables, index_base=0):
        """Initializes.
        variables: DataFrame
        index_base: are the indices 0 or 1 based?
        Attributes:
        colspecs: list of (start, end) index tuples
        names: list of string variable names
        """
        self.variables = variables
        # note: by default, subtract 1 from colspecs
        self.colspecs = variables[['start', 'end']] - index_base
        # convert colspecs to a list of pair of int
        self.colspecs = self.colspecs.astype(np.int).values.tolist()
        self.names = variables['name']

    def ReadFixedWidth(self, filename, **options):
        """Reads a fixed width ASCII file.
        filename: string filename
        returns: DataFrame
        """
        df = pd.read_fwf(filename,
                             colspecs=self.colspecs, 
                             names=self.names,
                             **options)
        return df

def ReadStataDct(dct_file):
	type_map = dict(byte=int, int=int, long=int, float=float, double=float)
	var_info = []
	for line in dct_file:
			match = re.search( r'_column\(([^)]*)\)', line)
			if match:
				start = int(match.group(1))
				t = line.split()
				vtype, name, fstring = t[1:4]
				name = name.lower()
				if vtype.startswith('str'):
					vtype = str
				else:
					vtype = type_map[vtype]
				long_desc = ' '.join(t[4:]).strip('"')
				var_info.append((start, vtype, name, fstring, long_desc))
	columns = ['start', 'type', 'name', 'fstring', 'desc']
	variables = pd.DataFrame(var_info, columns=columns)
	variables['end'] = variables.start.shift(-1)
	variables.loc[len(variables)-1, 'end'] = 0
	dct = FixedWidthVariables(variables, index_base=1)
	return dct

def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.
    df: DataFrame
    """
    # mother's age is encoded in centiyears; convert to years
    df.agepreg /= 100.0
    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN
    df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'] = np.nan
    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]
    df.birthwgt_lb.replace(na_vals, np.nan, inplace=True)
    df.birthwgt_oz.replace(na_vals, np.nan, inplace=True)
    df.hpagelb.replace(na_vals, np.nan, inplace=True)
    df.babysex.replace([7, 9], np.nan, inplace=True)
    df.nbrnaliv.replace([9], np.nan, inplace=True)
    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0    
    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df.cmintvw = np.nan

In [6]:
dct = ReadStataDct(fPDCT)
preg = dct.ReadFixedWidth(pregDataZip,compression='gzip' )
CleanFemPreg(preg)

In [7]:
dct = ReadStataDct(fRDCT)
resp = dct.ReadFixedWidth(respDataZip,compression='gzip' )

In [8]:
merged = pd.merge(resp,preg, how = 'inner',left_on = 'caseid',right_on = 'caseid')

In [9]:
RaceMap = {5:"White",4:"Black",3:"Other",2:"Other",1:"Other"}
OutComeMap = {1:"Live",2:"Abortion",3:"StillBorn",4:"Miscarriage",5:"Ectopic",6:"Current"}

In [10]:
##merged['race'] = merged['rscreenrace'].apply(lambda x: RaceMap[x])
merged['outcomedesc'] = merged['outcome'].apply(lambda x: OutComeMap[x])
bins = [15,30,50]
merged['AgeBuckets'] = pd.cut(merged['agepreg'],bins)
merged['AgeRounded'] = merged.agepreg.apply(lambda x: round(x,0))

In [11]:
def CleanHist(df,column):
    hist = {}
    t = df[column]
    for x in t:
        hist[x] = hist.get(x,0) + 1
    z = {x: hist[x] for x in hist if not isnan(x)}
    plt.bar(z.keys(),z.values())

In [12]:
live = merged[merged.outcome == 1]

In [13]:
firsts = live[live.birthord == 1] 
others = live[live.birthord != 1]
under30 = merged[merged.agepreg < 30]
over30 = merged[merged.agepreg >= 30]

In [14]:
MeanCompare = {}
MeanCompare['first'] = firsts.prglngth.mean()
MeanCompare['others'] = others.prglngth.mean()
MeanCompare

{'first': 38.60095173351461, 'others': 38.52291446673706}

In [14]:
def CohenEffectSize(group1,group2):
    diff = group1.mean() - group2.mean()
    var1 = group1.var()
    var2 = group2.var()
    n1,n2 = len(group1), len(group2)
    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
    d = diff / math.sqrt(pooled_var)
    return d

In [30]:
##zip(*sorted(firsts.prglngth.to_dict()))
zip(*sorted(firsts.prglngth.to_dict().items()))

<zip at 0x7fb8d384f148>

In [16]:
pF._Brewer.InitIter(2)

<IPython.core.display.Javascript object>

In [None]:
##def Bar2(df1,df2,xlabel,ylabel,limit = None, width = 0.45)
##    w = width

In [15]:
CohenEffectSize(firsts.prglngth,others.prglngth)

0.028879044654449834

In [17]:
CohenEffectSize(under30.outcome,over30.outcome)

  import sys


nan

In [44]:
CleanHist(live,'AgeRounded')

<IPython.core.display.Javascript object>

In [9]:
xaxis = clean_hist.keys()
yaxis = clean_hist.values()
plt.bar(xaxis,yaxis)

<IPython.core.display.Javascript object>

<BarContainer object of 16 artists>

In [29]:
racecounts = pd.DataFrame(resp['rscreenrace'].value_counts()).reset_index()

In [22]:
RowTotal = len(resp)

In [31]:
racecounts['Mix'] = racecounts.rscreenrace / RowTotal

In [34]:
racecounts['race'] = racecounts['index'].apply(lambda x: RaceMap[x])

In [72]:
plt.bar(racecounts['race'],racecounts['rscreenrace'])

<IPython.core.display.Javascript object>

<BarContainer object of 5 artists>

In [36]:

AgeOutCome = merged[merged.outcome.isin(['1','3','4','5'])].pivot_table(values = 'caseid', index = 'AgeBuckets' , columns = 'outcomedesc' ,aggfunc='count')

In [37]:
AgeOutCome

outcomedesc,Ectopic,Live,Miscarriage,StillBorn
AgeBuckets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(15, 30]",125,7260,1416,98
"(30, 50]",65,1820,467,20


In [38]:
AgeOutCome['Totals'] = AgeOutCome['Ectopic'] + AgeOutCome['Live'] + AgeOutCome['Miscarriage'] + AgeOutCome['StillBorn']

In [39]:
AgeOutCome['EP'] = (AgeOutCome['Ectopic'] / AgeOutCome['Totals'])*100
AgeOutCome['LP'] = (AgeOutCome['Live'] / AgeOutCome['Totals'])*100
AgeOutCome['MP'] = (AgeOutCome['Miscarriage'] / AgeOutCome['Totals'])*100
AgeOutCome['SP'] = (AgeOutCome['StillBorn'] / AgeOutCome['Totals'])*100
AgeOutCome

outcomedesc,Ectopic,Live,Miscarriage,StillBorn,Totals,EP,LP,MP,SP
AgeBuckets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"(15, 30]",125,7260,1416,98,8899,1.404652,81.5822,15.9119,1.101247
"(30, 50]",65,1820,467,20,2372,2.740304,76.728499,19.688027,0.84317


In [25]:
plt.scatter(merged.agepreg,merged.outcomedesc,alpha=0.5)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7fbce921a908>

In [55]:
merged.outcome.value_counts()

1    9148
4    1921
2    1862
6     352
5     190
3     120
Name: outcome, dtype: int64