### Child marriage data first look
Turns out, this data is hyper-refined and difficult to work with.

In [1]:
# load packages
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)
import pyreadstat # this reads in the spss file

#### Load wm.sav
This returns a pandas dataframe and a file with metadata from spss. It's confusing because some of the boolean (true/valse) variables come in as numeric data types (0.0 and 1.0), and other categorical variables read as object data types. 

In [2]:
# apply_value_formats is by default False, so you have to set it to True manually if you want the labels
# formats_as_category is by default True, and it means the replaced values will be transformed to a pandas category column.
# df2, meta = pyreadstat.read_sav("wm.sav", apply_value_formats=False, formats_as_category=True)

In [3]:
df2, meta2 = pyreadstat.read_sav('wm.sav')

In [4]:
df2.head()

Unnamed: 0,HH1,HH2,LN,WM1,WM2,WM4,WM5,WM6D,WM6M,WM6Y,...,CDEAD,welevel,religion,wmweight,wscore,windex5,wscoreu,windex5u,wscorer,windex5r
0,1.0,1.0,2.0,1.0,1.0,2.0,117.0,14.0,2.0,2013.0,...,0.0,1.0,1.0,0.468799,-0.588731,2.0,-1.038469,1.0,,
1,1.0,2.0,2.0,1.0,2.0,2.0,117.0,14.0,2.0,2013.0,...,0.0,2.0,1.0,0.468799,1.525888,5.0,0.474476,3.0,,
2,1.0,3.0,2.0,1.0,3.0,2.0,117.0,14.0,2.0,2013.0,...,0.0,5.0,1.0,0.468799,-0.104456,3.0,-0.691985,2.0,,
3,1.0,4.0,2.0,1.0,4.0,2.0,117.0,14.0,2.0,2013.0,...,0.0,1.0,1.0,0.468799,-1.20344,1.0,-1.478274,1.0,,
4,1.0,4.0,3.0,1.0,4.0,3.0,117.0,14.0,2.0,2013.0,...,0.0,2.0,1.0,0.468799,-1.20344,1.0,-1.478274,1.0,,


In [5]:
# replace the column names with column labels
df2.columns = meta2.column_labels
# to go back to column names
# df.columns = meta.column_names

In [6]:
df2.head()
# print(meta2.colum_names)
# print(meta2.column_labels)
# print(meta2.number_rows)
# print(meta2.number_columns)
# print(meta2.file_label)
# print(meta2.file_encoding)
# there are other metadata pieces extracted. See the documentation for more details.

Unnamed: 0,Cluster number,Household number,Line number,Cluster number.1,Household number.1,Woman's line number,Interviewer number,Day of interview,Month of interview,Year of interview,...,Children dead,Education,Religion of household head,Women's sample weight,Combined wealth score,Wealth index quintile,Urban wealth score,Urban wealth index quintile,Rural wealth score,Rural wealth index quintile
0,1.0,1.0,2.0,1.0,1.0,2.0,117.0,14.0,2.0,2013.0,...,0.0,1.0,1.0,0.468799,-0.588731,2.0,-1.038469,1.0,,
1,1.0,2.0,2.0,1.0,2.0,2.0,117.0,14.0,2.0,2013.0,...,0.0,2.0,1.0,0.468799,1.525888,5.0,0.474476,3.0,,
2,1.0,3.0,2.0,1.0,3.0,2.0,117.0,14.0,2.0,2013.0,...,0.0,5.0,1.0,0.468799,-0.104456,3.0,-0.691985,2.0,,
3,1.0,4.0,2.0,1.0,4.0,2.0,117.0,14.0,2.0,2013.0,...,0.0,1.0,1.0,0.468799,-1.20344,1.0,-1.478274,1.0,,
4,1.0,4.0,3.0,1.0,4.0,3.0,117.0,14.0,2.0,2013.0,...,0.0,2.0,1.0,0.468799,-1.20344,1.0,-1.478274,1.0,,


In [7]:
print(meta2.file_encoding)

UTF-8


In [8]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59599 entries, 0 to 59598
Columns: 235 entries, Cluster number to Rural wealth index quintile
dtypes: float64(160), object(75)
memory usage: 106.9+ MB


#### Categorical (object) variables

In [9]:
df2.select_dtypes(include='object').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59599 entries, 0 to 59598
Data columns (total 75 columns):
Last birth in last two years                                       59599 non-null object
Antenatal care: Doctor                                             59599 non-null object
Antenatal care: Nurse / Midwife                                    59599 non-null object
Antenatal care: Auxiliary midwife                                  59599 non-null object
Antenatal care: Traditional birth attendant                        59599 non-null object
Antenatal care: Community health worker                            59599 non-null object
Antenatal care: Other                                              59599 non-null object
Assistance at delivery: Doctor                                     59599 non-null object
Assistance at delivery: Nurse / Midwife                            59599 non-null object
Assistance at delivery: Auxiliary midwife                          59599 non-null object
A

#### Missing data
Code below calculates the percent of NaN data in each variable, but unfortunately it's not counting all the empty cells in variables of type object. See below. Apparently NaN only registers in numeric variables.

In [66]:
# percent missing NaN
(df2.shape[0]-df2.count())/59599

Cluster number                 0.000000
Household number               0.000000
Line number                    0.000000
Cluster number                 0.000000
Household number               0.000000
Woman's line number            0.000000
Interviewer number             0.000000
Day of interview               0.000000
Month of interview             0.000000
Year of interview              0.000000
                                 ...   
Children dead                  0.131009
Education                      0.131009
Religion of household head     0.000000
Women's sample weight          0.000000
Combined wealth score          0.000000
Wealth index quintile          0.000000
Urban wealth score             0.718804
Urban wealth index quintile    0.718804
Rural wealth score             0.150187
Rural wealth index quintile    0.150187
Length: 235, dtype: float64

In [70]:
np.count_nonzero(df2['Urban wealth score'].isnull())/59599


0.7188040067786372

#### Example numeric var
NaNs are counted here.

In [71]:
df2['Urban wealth score'].value_counts(dropna=False).head()


NaN          42840
 0.000000     7808
 2.249036        7
 2.287165        6
 1.176316        5
Name: Urban wealth score, dtype: int64

#### Example object var
NaNs are blank or empty.

In [72]:
df2['Antenatal care: Doctor'].value_counts(dropna=False)

     55609
A     3988
?        2
Name: Antenatal care: Doctor, dtype: int64

In [10]:
# get a colunm index value
df2.columns.get_loc('Antenatal care: Doctor')

67

In [11]:
df2.iloc[:,67].value_counts(dropna=False)

     55609
A     3988
?        2
Name: Antenatal care: Doctor, dtype: int64

In [12]:
# access multiple columns
df2.iloc[:, 67:73].head()

Unnamed: 0,Antenatal care: Doctor,Antenatal care: Nurse / Midwife,Antenatal care: Auxiliary midwife,Antenatal care: Traditional birth attendant,Antenatal care: Community health worker,Antenatal care: Other
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,


### Variables spread across multiple columns
Another example: Information about some variables, like Antenatal Care, are spread across several columns. The type of caregiver before childbirth can be A, G, B, C, ? or F. 

In [13]:
# melt Antenatal Care into one colum and count frequency of each.
test_long = pd.melt(df2.iloc[:, 67:72],
                    var_name='Antenatal care',
                    value_name='Caregiver')

In [14]:
test_long['Caregiver'].value_counts(dropna=False)

     292533
A      3988
G       718
B       677
C        48
F        21
?        10
Name: Caregiver, dtype: int64

#### Second example: Assistance at delivery

In [15]:
# get column start index
df2.columns.get_loc('Assistance at delivery: Doctor')

83

In [16]:
# check results
df2.iloc[:, 83:91].head()

Unnamed: 0,Assistance at delivery: Doctor,Assistance at delivery: Nurse / Midwife,Assistance at delivery: Auxiliary midwife,Assistance at delivery: Traditional birth attendant,Assistance at delivery: Community health worker,Assistance at delivery: Relative / Friend,Assistance at delivery: Other,Assistance at delivery: No one
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,


In [17]:
# count frequencies for this var
test_long2 = pd.melt(df2.iloc[:, 83:91],
                    var_name='Assistance at delivery',
                    value_name='Provider')

In [20]:
test_long2['Provider'].value_counts(dropna=False)

     465453
H      3733
A      2100
F      2045
B      1786
C       723
?       528
X       244
G       138
Y        42
Name: Provider, dtype: int64

### Except that there are combined situations
When multiple providers are present at delivery, there are combinations -- 63 different cases.

In [21]:
# concatenate values from multiple provider type columns
col_cat = df2.iloc[:,83].str.cat(df2.iloc[:,84:91])

In [22]:
col_cat.value_counts(dropna=False)

         51733
H         2354
F         1237
AB         922
A          654
FH         606
B          418
C          354
ABH        204
CH         180
         ...  
ABCFG        1
BCGH         1
ABCG         1
BHX          1
BX           1
ACFX         1
BFG          1
ACG          1
ACGH         1
AFGH         1
Name: Assistance at delivery: Doctor, Length: 63, dtype: int64