In [1]:
import os
import pandas as pd

In [2]:
# List the files in the dataset folder; the output below shows the Stata
# data file (.DTA) next to its companion .DOC/.DCF/.FRQ/.FRW/.MAP files.
os.listdir('data/largefile/IAMR73')

['IAIR73.DOC',
 'IAMR73.DCF',
 'IAMR73.DTA',
 'IAMR73.FRQ',
 'IAMR73.FRW',
 'IAMR73.MAP']

In [3]:
# NOTE(review): this read fails with the ValueError shown below (the file's
# Stata version is not one of the versions listed in the message), so `itr`
# is NOT created here. The `itr` inspected in the following cells presumably
# comes from the later read of 'IAPR73FL.DTA' -- confirm, and remove or fix
# this cell so the notebook survives Restart & Run All.
itr = pd.read_stata('data/largefile/IAMR73/IAMR73.DTA', chunksize=100000)

ValueError: Version of given Stata file is not 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)

# present data as is

In [31]:
itr.nvar # number of variables (columns) in the file

428

In [33]:
itr.nobs # total number of observations (rows) in the file, across all chunks

2869043

In [39]:
list(itr.variable_labels().items())[0:20] # what do these variables look like? first 20 (variable name, label) pairs

[('hhid', 'case identification'),
 ('hvidx', 'line number'),
 ('hv000', 'country code and phase'),
 ('hv001', 'cluster number'),
 ('hv002', 'household number'),
 ('hv003', "respondent's line number (answering household questionnaire)"),
 ('hv004', 'ultimate area unit'),
 ('hv005', 'household sample weight (6 decimals)'),
 ('hv006', 'month of interview'),
 ('hv007', 'year of interview'),
 ('hv008', 'date of interview (cmc)'),
 ('hv009', 'number of household members'),
 ('hv010', 'number of eligible women in household'),
 ('hv011', 'number of eligible men in household'),
 ('hv012', 'number of de jure members'),
 ('hv013', 'number of de facto members'),
 ('hv014', 'number of children 5 and under (de jure)'),
 ('hv015', 'result of household interview'),
 ('hv016', 'day of interview'),
 ('hv017', 'number of visits')]

In [None]:
# categorical value description

In [48]:
# First 3 value-label sets: (label-set name, {numeric code: description}).
list(itr.value_labels().items())[0:3]

[('HV003', {0: 'incomplete household'}),
 ('HV015',
  {1: 'completed',
   2: 'no household member/no competent member at home',
   3: 'entire household absent for extended period of time',
   4: 'postponed',
   5: 'refused',
   6: 'dwelling vacant or address not a dwelling',
   7: 'dwelling destroyed',
   8: 'dwelling not found',
   9: 'other'}),
 ('HV020', {0: 'all woman sample', 1: 'ever married sample'})]

# load data by chunk

In [68]:
# Number of rows to read from the Stata file per chunk.
CHUNK_SIZE = 100_000

In [69]:
# Open the Stata file for chunked reading; `itr` is used below as a reader
# (get_chunk(), nobs, variable_labels(), iteration) rather than a DataFrame.
# NOTE(review): this path is relative to the working directory, unlike the
# 'data/largefile/...' paths used earlier -- confirm where the file lives.
itr = pd.read_stata('IAPR73FL.DTA', chunksize=CHUNK_SIZE)

In [70]:
# NOTE: get_chunk() advances the reader, so this shape check consumes one
# chunk; the next get_chunk() call returns the rows that follow it.
itr.get_chunk().shape # it might take a while

(100000, 428)

In [71]:
# NOTE: despite its name this is the SECOND chunk -- the get_chunk() call in
# the previous cell already consumed the first one, which is why the index
# shown by head() below starts at 100000.
first_batch=itr.get_chunk()

In [72]:
# Preview the first 10 rows of this chunk (row index starts at 100000).
first_batch.head(10)

Unnamed: 0,hhid,hvidx,hv000,hv001,hv002,hv003,hv004,hv005,hv006,hv007,...,hml32a,hml32b,hml32c,hml32d,hml32e,hml32f,hml32g,hml33,hml34,hml35
100000,3067731,3,IA6,30677,31,3,677,49000,11,2016,...,,,,,,,,,,
100001,3067731,4,IA6,30677,31,3,677,49000,11,2016,...,,,,,,,,,,
100002,3067734,1,IA6,30677,34,1,677,49000,11,2016,...,,,,,,,,,,
100003,3067734,2,IA6,30677,34,1,677,49000,11,2016,...,,,,,,,,,,
100004,3067737,1,IA6,30677,37,2,677,49000,11,2016,...,,,,,,,,,,
100005,3067737,2,IA6,30677,37,2,677,49000,11,2016,...,,,,,,,,,,
100006,3067737,3,IA6,30677,37,2,677,49000,11,2016,...,,,,,,,,,,
100007,3067737,4,IA6,30677,37,2,677,49000,11,2016,...,,,,,,,,,,
100008,3067737,5,IA6,30677,37,2,677,49000,11,2016,...,,,,,,,,,,
100009,3067737,6,IA6,30677,37,2,677,49000,11,2016,...,,,,,,,,,,


# rename variables


In [73]:
# Replace the short variable names (hhid, hv000, ...) with their descriptive
# labels taken from the Stata file.
# NOTE(review): labels do not appear to be unique -- the header below shows
# 'na - presence of species: cs', '... cs.1', '... cs.2' -- so the renamed
# frame may contain duplicate column names; confirm before selecting by label.
first_batch = first_batch.rename(columns=itr.variable_labels())
first_batch.head()

Unnamed: 0,case identification,line number,country code and phase,cluster number,household number,respondent's line number (answering household questionnaire),ultimate area unit,household sample weight (6 decimals),month of interview,year of interview,...,na - presence of species: falciparum (pf),na - presence of species: malariae (pm),na - presence of species: ovale (po),na - presence of species: vivax (pv),na - presence of species: cs,na - presence of species: cs.1,na - presence of species: cs.2,na - result of malaria measurement,bar code for blood smear sample,na - result of malaria rapid test
100000,3067731,3,IA6,30677,31,3,677,49000,11,2016,...,,,,,,,,,,
100001,3067731,4,IA6,30677,31,3,677,49000,11,2016,...,,,,,,,,,,
100002,3067734,1,IA6,30677,34,1,677,49000,11,2016,...,,,,,,,,,,
100003,3067734,2,IA6,30677,34,1,677,49000,11,2016,...,,,,,,,,,,
100004,3067737,1,IA6,30677,37,2,677,49000,11,2016,...,,,,,,,,,,


In [74]:
# Export this chunk (columns renamed to their labels) as a CSV sample file.
first_batch.to_csv("sample_IAPR73FL.csv")

# How many batches are there?

In [76]:
# Rows per file divided by rows per chunk. The ratio is not a whole number,
# so the rows do not split evenly: round up to get the number of chunks
# (the last one is presumably a partial chunk -- confirm).
itr.nobs/CHUNK_SIZE

28.69043

In [None]:
# Further analysis

In [None]:
def do_something(chunk):
    """Placeholder for the per-chunk analysis step.

    Parameters
    ----------
    chunk
        One chunk yielded by the chunked Stata reader; the processing loop
        calls this function once per chunk.

    Returns
    -------
    None
        The placeholder performs no work yet.
    """
    # TODO: replace with the real per-chunk analysis.
    return None

In [None]:
# `itr` was already advanced by the two get_chunk() calls earlier in the
# notebook, so looping over it directly would silently skip the first two
# chunks. Re-open the file so that every chunk is processed from the start;
# the `with` block also closes the file handle when the loop finishes.
with pd.read_stata('IAPR73FL.DTA', chunksize=CHUNK_SIZE) as reader:
    for chunk in reader:
        do_something(chunk)