In [None]:
! pip install --quiet environs cyksuid toolz psycopg2-binary typing_json backoff xxhash pyyaml

In [None]:
! pip install --quiet git+https://github.com/nandanrao/facebook-python-business-sdk@pagination

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from environs import Env
env = Env()
env.read_env('.env-bug', override=True)

In [5]:
from adopt.malaria import get_conf, get_df, lookalike
from adopt.marketing import Marketing

cnf = get_conf(env)
stratum = cnf['strata'][0]
surveys = stratum['surveys']
df = get_df(cnf)
# m = Marketing(env)

In [6]:
from adopt.malaria import load_cities

# Load the city table from output/cities.csv.
cities = load_cities('output/cities.csv')

# Collapse to one row per district: keep the first row seen for each
# `distname`, renumber the index, and drop the listed columns
# (rad, lng, lat, distcode, id).
districts = cities.groupby('distname') \
                  .head(1) \
                  .reset_index(drop=True) \
                  .drop(columns=['rad', 'lng', 'lat', 'distcode', 'id'])

In [7]:
from adopt.responses import get_forms
from datetime import datetime

survey_user = '10383123-9fb1-429b-8312-447c1b54b17a'
shortcodes = [ 'baselineeng', 'baselinehin', 'baselineodi']

timestamp = datetime(2020, 8, 5)

forms = list(get_forms(survey_user, shortcodes, timestamp, cnf['chatbase']))

In [None]:
forms[0]

In [14]:
questions = [
    ('malaria_incidence', 'f6e69027-97cc-494e-8d52-318b75047e23'),
    ('malaria_now', 'dad23031-8468-4900-89cc-d01841d8b660'),
    ('dist_medical', 'd7573919-8a7e-457f-9a1d-1f8c389127a7'),
    ('education', 'e40fa1c6-13a1-4a02-91cd-0eaade11864d'),
    ('owns_net', 'bd4802c6-7bdb-40f0-aac1-18cc6df7da6e'),
    ('has_ac', 'e279587c-975f-433a-adab-1ad563876af6'),
    ('net_lastnight', 'b5b1ff58-c8df-4890-9b1c-0cd40ce6edc0'),
    ('home', '4d0ae478-3893-4b46-ab39-d6848c69245d'), 
    ('occupation', '4fc929c7-132d-49b1-a164-515e5cc9064f')]


In [15]:
from adopt.responses import get_response_df
from datetime import datetime, timezone
from adopt.clustering import only_target_users
import pandas as pd
from adopt.clustering import add_res_cols
from adopt.forms import response_translator
from adopt.malaria import shape_df


def filter_time(df, min_date, max_date):
    """Keep every row belonging to users seen inside a UTC time window.

    Args:
        df: frame with ``timestamp`` and ``userid`` columns.
        min_date: tuple of positional ``datetime`` arguments, e.g.
            ``(2020, 7, 1)``; interpreted as UTC. Exclusive lower bound.
        max_date: same format as ``min_date``. Exclusive upper bound.

    Returns:
        All rows of ``df`` whose ``userid`` has at least one row with a
        ``timestamp`` strictly between the two bounds. Rows of a qualifying
        user that fall outside the window are kept as well.
    """
    start = datetime(*min_date, tzinfo=timezone.utc)
    end = datetime(*max_date, tzinfo=timezone.utc)

    inside_window = (df.timestamp > start) & (df.timestamp < end)
    active_users = df.loc[inside_window, 'userid'].unique()
    belongs_to_active = df.userid.isin(active_users)
    return df[belongs_to_active]

def ref_translation(eng_form, other_form):
    """Build ref lookups between two forms by pairing their fields by position.

    Both arguments are mappings with a ``'fields'`` list whose items carry a
    ``'ref'`` key. Fields are paired in order (extra fields on the longer
    form are ignored).

    Returns:
        ``(lookup, rev_lookup)`` where ``lookup`` maps each ``eng_form`` ref
        to the ``other_form`` ref at the same position, and ``rev_lookup``
        is the inverse mapping.
    """
    lookup = {}
    for eng_field, other_field in zip(eng_form['fields'], other_form['fields']):
        lookup[eng_field['ref']] = other_field['ref']

    rev_lookup = {other_ref: eng_ref for eng_ref, other_ref in lookup.items()}
    return lookup, rev_lookup


def get_filtered_responses(survey_user, eng_form, other_form, shortcodes, questions, db_cnf):
    """Fetch responses to ``questions`` and keep only users who answered the last one.

    Args:
        survey_user: passed through to ``get_response_df``.
        eng_form: form whose field refs are used in ``questions``.
        other_form: form actually being queried; refs are mapped onto it
            position-by-position via ``ref_translation``.
        shortcodes: passed through to ``get_response_df``.
        questions: sequence of ``(name, ref)`` pairs, refs from ``eng_form``.
        db_cnf: passed through to ``get_response_df``.

    Returns:
        The response frame restricted to users having a row for the last
        question in ``questions``, with a fresh integer index.

    Raises:
        KeyError: if a question ref is not a field of ``eng_form``.
        IndexError: if ``questions`` is empty.
    """
    ref_lookup, _ = ref_translation(eng_form, other_form)
    refs = [ref_lookup[r] for _, r in questions]
    responses = get_response_df(survey_user, shortcodes, refs, db_cnf)

    # The query above is issued with the *translated* refs, so the ref of the
    # last question must be translated as well. Comparing against the
    # untranslated ref matches nothing whenever the two forms use different
    # refs (and is identical when they share refs).
    # NOTE(review): presumes `question_ref` holds the refs passed to
    # get_response_df — confirm against adopt.responses.
    last_ref = refs[-1]

    answered = responses[responses.question_ref == last_ref].userid.unique()

    return responses[responses.userid.isin(answered)].reset_index(drop=True)

def malaria_prob(groupby, key='malaria'):
    """Share of rows in each group for which ``key`` equals ``True``.

    Args:
        groupby: a pandas DataFrameGroupBy exposing the column ``key``.
        key: name of the indicator column to summarise.

    Returns:
        DataFrame indexed by the grouping keys with a single ``'count'``
        column holding the normalized frequency of ``key == True``. Groups
        for which no ``True`` row is reported are absent from the result.
    """
    shares = groupby[key].value_counts(normalize=True).rename('count')
    # Move the innermost index level (the values of `key`) into a column so
    # it can be filtered on.
    table = shares.reset_index(level=-1)
    is_positive = table[key] == True
    return table[is_positive].drop(columns=key)


def stats(df):
    """Summarise the ``kutcha`` column of ``df``.

    Returns:
        Series indexed ``perc``, ``target``, ``non_target``, ``tot`` where
        ``target`` is the sum of ``df.kutcha``, ``tot`` the number of rows,
        ``non_target`` their difference and ``perc`` is ``target / tot``.
    """
    n_rows = df.shape[0]
    n_target = df.kutcha.sum()
    labels = ['perc', 'target', 'non_target', 'tot']
    values = [n_target / n_rows, n_target, n_rows - n_target, n_rows]
    return pd.Series(values, index=labels)

def col_translators(a, b, questions):
    """Build ``(name, translated_ref, translator)`` triples for ``questions``.

    Args:
        a: form whose field refs are used in ``questions``.
        b: form paired with ``a`` field-by-field (by position).
        questions: sequence of ``(name, ref)`` pairs with refs from ``a``.

    Returns:
        One triple per question, in order: the question name, the ref of the
        field of ``b`` at the same position, and the object returned by
        ``response_translator(field_of_b, field_of_a)`` for that question.
    """
    forward, _ = ref_translation(a, b)

    wanted = [ref for _, ref in questions]
    translators = {}
    for a_field, b_field in zip(a['fields'], b['fields']):
        if a_field['ref'] in wanted:
            translators[a_field['ref']] = response_translator(b_field, a_field)

    return [(name, forward[ref], translators[ref]) for name, ref in questions]


In [None]:
filtered_res = get_filtered_responses(survey_user, forms[0], forms[1], ['baselinehin', 'baselinehinexc'], questions, cnf['chatbase'])

col_names = col_translators(forms[0], forms[1], questions)
rr = add_res_cols(col_names, shape_df(filtered_res)) \
    .reset_index(drop=True) \
    .dropna()

In [17]:
# Join each response row to its district (cluster id matched against the
# district hash), then derive boolean indicators from the raw answers.
# `malaria_now` is replaced in place by its boolean form.
dd = (
    rr.merge(districts, left_on='md:clusterid', right_on='disthash')
      .assign(
          under_net=lambda d: d.net_lastnight == 'Yes',
          malaria=lambda d: d.malaria_incidence == 'Yes',
          malaria_now=lambda d: d.malaria_now == 'Yes',
          kutcha=lambda d: d.home == 'Kutcha (made of mud, tin, straw)',
      )
)

In [18]:
perc = dd.groupby('disthash').apply(stats).reset_index()

In [23]:
from adopt.clustering import get_budget_lookup
from adopt.malaria import window, days_left, get_df, get_cluster_from_adset
from adopt.marketing import BudgetWindow
from adopt.facebook.state import CampaignState

# w = BudgetWindow(datetime(2020,8,10), datetime.now())

w = window(8)
state = CampaignState(env, w)

spend = {get_cluster_from_adset(n): i
         for n, i in state.spend.items()}

INFO:root:Campaign impact-evaluation-vlab-11 has 286 creatives, and 109 running ads


In [49]:
import json

# Rebind `stratum` to the first entry of the 'strata' list stored in
# config/strata-kutcha.json.
with open('config/strata-kutcha.json') as f:
    stratum = json.load(f)['strata'][0]

In [51]:
# Compute the per-cluster budget lookup from the response frame `df`, the
# active `stratum`, the budget window `w` and the spend so far.
# NOTE(review): the four literals (inf, 100000, 90, 10) are passed
# positionally; their meaning is defined by the signature of
# adopt.clustering.get_budget_lookup, which is not visible here — confirm and
# consider passing them by keyword.
budget_lookup = get_budget_lookup(df,
                                  stratum,
                                  float('inf'),
                                  100000,
                                  90,
                                  10,
                                  w,
                                  spend)







In [None]:
bb = pd.Series(budget_lookup) / 100

with pd.option_context('display.max_rows', None):
    display(bb.sort_values()[:80])

In [54]:
bb.sort_values()[:80].sum() / 70 * 10

112608.93142857144

In [None]:
def add_budget(r, lookup):
    """Attach the budget of the row's district.

    Sets ``r['budget']`` in place to ``lookup[r['disthash']]``, or ``None``
    when the district has no entry in ``lookup``, and returns the same ``r``.
    """
    district = r['disthash']
    r['budget'] = lookup.get(district)
    return r

perc = perc.apply(lambda r: add_budget(r, budget_lookup), 1)

In [None]:
with pd.option_context('display.max_rows', None):
    display(perc.sort_values('tot')[-90:])

In [64]:
filter_time(dd, (2020,7,1), (2020,8,1)).home.value_counts(1)

Pucca (have cement/brick wall and floor    0.578773
Semi-pucca                                 0.243160
Kutcha (made of mud, tin, straw)           0.178067
Name: home, dtype: float64

In [145]:
dd[(dd.home == 'Pucca (have cement/brick wall and floor') & (dd.has_ac == 'Yes')].shape

(598, 34)

In [65]:
filter_time(dd, (2020,8,11), (2020,8,12)).home.value_counts(1)

Pucca (have cement/brick wall and floor    0.526802
Semi-pucca                                 0.252002
Kutcha (made of mud, tin, straw)           0.221195
Name: home, dtype: float64

In [29]:
audienced = districts[districts.include_audience == True].disthash

In [105]:
filter_time(dd[dd.disthash.isin(audienced)], (2020,8,1), (2020,8,8)).home.value_counts(1)

Pucca (have cement/brick wall and floor    0.423507
Semi-pucca                                 0.294776
Kutcha (made of mud, tin, straw)           0.281716
Name: home, dtype: float64

In [106]:
filter_time(dd[dd.disthash.isin(audienced)], (2020,8,8), (2020,8,9)).home.value_counts(1)

Pucca (have cement/brick wall and floor    0.505935
Semi-pucca                                 0.270030
Kutcha (made of mud, tin, straw)           0.224036
Name: home, dtype: float64

In [30]:
malaria_prob(filter_time(dd[dd.disthash.isin(audienced)], (2020,8,8), (2020,8,9)).groupby('disthash'), 'kutcha').mean()

count    0.328153
dtype: float64

In [31]:
malaria_prob(filter_time(dd[dd.disthash.isin(audienced)], (2020,8,9), (2020,8,10)).groupby('disthash'), 'kutcha').mean()

count    0.42933
dtype: float64

In [32]:
malaria_prob(filter_time(dd[dd.disthash.isin(audienced)], (2020,8,10), (2020,8,11)).groupby('disthash'), 'kutcha').mean()

count    0.394834
dtype: float64

In [76]:
malaria_prob(filter_time(dd, (2020,7,1), (2020,8,1)).groupby(['disthash']), 'kutcha').quantile(.5)

count    0.2
Name: 0.5, dtype: float64

In [40]:
malaria_prob(filter_time(dd, (2020,8,7), (2020,8,11)).groupby(['disthash']), 'kutcha').quantile(.5)

count    0.25
Name: 0.5, dtype: float64

In [35]:
malaria_prob(filter_time(dd, (2020,8,1), (2020,8,10)).groupby(['disthash']), 'kutcha').quantile(.5)

count    0.25
Name: 0.5, dtype: float64

In [346]:
# NOTE(review): another cell of this notebook imports get_cluster_from_adset
# from `adopt.malaria`; confirm which module path is current.
from marketing import get_cluster_from_adset

# NOTE(review): `m` is only assigned in a commented-out line of this notebook
# (`# m = Marketing(env)`), so this cell fails on a fresh kernel — confirm
# where `running_ads` should come from.
running = [get_cluster_from_adset(s) for s in m.running_ads.keys()]
base_cities = pd.read_csv('output/base-cities.csv')
# Keep only the base cities whose district hash has a running ad.
cities = base_cities[base_cities.disthash.isin(running)].reset_index(drop=True)

# Flag districts with a kutcha share below 35% and more than 70 respondents,
# then overwrite output/cities.csv (the file read by load_cities above).
target_kutchas = perc[(perc.perc < .35) & (perc.tot > 70)].disthash
cities['include_audience'] = cities.disthash.isin(target_kutchas)
cities.to_csv('output/cities.csv', index=False)

In [44]:
dd.home.value_counts()

Pucca (have cement/brick wall and floor    3867
Semi-pucca                                 1749
Kutcha (made of mud, tin, straw)           1468
Name: home, dtype: int64

In [60]:
# District hashes whose share of respondents reporting malaria is above 25%
# (high) or below 15% (low); everything in between stays 'med'.
high_malaria_dists = (
    dd.groupby('disthash').malaria.mean().reset_index()
      .loc[lambda rates: rates.malaria > .25, 'disthash']
)
low_malaria_dists = (
    dd.groupby('disthash').malaria.mean().reset_index()
      .loc[lambda rates: rates.malaria < .15, 'disthash']
)
# Alternative definitions of the low-risk set tried earlier:
# low_malaria_dists = perc[(perc.perc == 0.0) & (perc.target >= 0)].disthash.unique()
# low_malaria_dists = cities[cities.include_audience == True].disthash.unique()

# Label every respondent with the risk class of their cluster; 'high' is
# written last, so it wins if a district were ever in both sets.
dd['dist_risk'] = 'med'
dd.loc[dd['md:clusterid'].isin(low_malaria_dists), 'dist_risk'] = 'low'
dd.loc[dd['md:clusterid'].isin(high_malaria_dists), 'dist_risk'] = 'high'

In [None]:
dd.groupby('disthash').head(1)['dist_risk'].value_counts()

In [None]:
malaria_prob(dd.groupby('home'))

In [None]:
dd.groupby('education').has_ac.value_counts(1)

In [62]:
malaria_prob(dd.groupby(['home']), 'malaria_now')

Unnamed: 0_level_0,count
home,Unnamed: 1_level_1
"Kutcha (made of mud, tin, straw)",0.028401
Pucca (have cement/brick wall and floor,0.01036
Semi-pucca,0.016762


In [None]:
malaria_prob(dd.groupby(['home', 'has_ac']), 'malaria_now')

In [68]:
tot = 1345928 + 196952 + 17682 + 40222 + 89997 + 113061 + 50145 + 141910 + 45000

In [69]:
tot / 70

29155.67142857143

35000

In [55]:
malaria_prob(dd.groupby(['home']), 'malaria_now')

Unnamed: 0_level_0,count
home,Unnamed: 1_level_1
"Kutcha (made of mud, tin, straw)",0.029933
Pucca (have cement/brick wall and floor,0.010189
Semi-pucca,0.016833


In [48]:
# dd['include_audience'] = dd.dist_risk == 'low'

# cities = cities.merge(dd.groupby('disthash').head(1).reset_index(drop=True)[['disthash', 'include_audience']])

# cities.to_csv('output/cities.csv', index=False)

In [283]:
dd.groupby('dist_risk').malaria.value_counts()

dist_risk  malaria
high       False       921
           True        510
low        False      1927
           True        230
med        False      1736
           True        401
Name: malaria, dtype: int64

In [168]:
filter_time(dd, (2020,7,21), (2020,7,25)) \
    .pipe(lambda df: df[df.disthash.isin(low_malaria_dists)]).shape[0] / filter_time(dd, (2020,7,21), (2020,7,25)).shape[0]

0.48735408560311283

In [169]:
filter_time(dd, (2020,8,1), (2020,8,10)) \
    .pipe(lambda df: df[df.disthash.isin(low_malaria_dists)]).shape[0] / filter_time(dd, (2020,8,1), (2020,8,10)).shape[0]

0.26901062959934585

In [216]:
filter_time(dd[dd.disthash.isin(low_malaria_dists)], (2020,7,21), (2020,7,24)).home.value_counts(normalize=True)

Pucca (have cement/brick wall and floor    0.601583
Semi-pucca                                 0.262533
Kutcha (made of mud, tin, straw)           0.135884
Name: home, dtype: float64

In [215]:
filter_time(dd[dd.disthash.isin(low_malaria_dists)], (2020,8,1), (2020,8,7)).home.value_counts(normalize=True)

Pucca (have cement/brick wall and floor    0.494845
Semi-pucca                                 0.268041
Kutcha (made of mud, tin, straw)           0.237113
Name: home, dtype: float64

In [214]:
filter_time(dd[dd.disthash.isin(low_malaria_dists)], (2020,8,7), (2020,8,8)).home.value_counts(normalize=True)

Pucca (have cement/brick wall and floor    0.385965
Kutcha (made of mud, tin, straw)           0.315789
Semi-pucca                                 0.298246
Name: home, dtype: float64

In [32]:
saturated = perc[perc.non_target >= 150].disthash

In [34]:
saturated

23     2e0a86c3
31     3c73baaa
32     419b24e9
35     4c6cbda0
54     786f66b7
59     8ed5f728
86     cbea7cbf
87     ce04df02
103    e77e76a7
Name: disthash, dtype: object

In [35]:
cities['creative_group'] = 'hindi'
cities.loc[cities.disthash.isin(saturated), 'creative_group'] = 'exclusions'

In [36]:
cities['include_audience'] = False

In [None]:
cities

In [38]:
cities.to_csv('output/cities.csv', index=False)

In [502]:
# District fixed effects of puccaness: compare malaria shares between the two
# `pucca` groups inside each district, then take the median across districts.

# `pucca` is True for every home type other than Kutcha, so Semi-pucca
# households count as pucca here.
dd['pucca'] = dd.home != 'Kutcha (made of mud, tin, straw)'

# malaria_prob gives one row per (disthash, pucca) pair for which a
# malaria == True share is reported. Districts with a single such row are
# dropped; for the others the first row's share minus the second row's share
# is computed, and the 0.5 quantile of those differences is displayed.
# NOTE(review): presumably the first row is pucca == False (group keys sorted
# ascending), which would make this "kutcha minus pucca" — confirm.
malaria_prob(dd.groupby(['disthash', 'pucca'])).reset_index() \
    .groupby('disthash') \
    .filter(lambda df: df.shape[0] > 1) \
    .groupby('disthash') \
    .apply(lambda df: df.iloc[0]['count'] - df.iloc[1]['count']).quantile(0.5)

0.03352007469654528

In [227]:
malaria_prob(filter_time(dd, (2020,7,21), (2020,8,1)).groupby(['dist_risk', 'home']))

Unnamed: 0_level_0,Unnamed: 1_level_0,count
dist_risk,home,Unnamed: 2_level_1
high,"Kutcha (made of mud, tin, straw)",0.425703
high,Pucca (have cement/brick wall and floor,0.322284
high,Semi-pucca,0.354839
low,"Kutcha (made of mud, tin, straw)",0.136719
low,Pucca (have cement/brick wall and floor,0.090982
low,Semi-pucca,0.135021
med,"Kutcha (made of mud, tin, straw)",0.185965
med,Pucca (have cement/brick wall and floor,0.188024
med,Semi-pucca,0.218919


In [252]:
malaria_prob(filter_time(dd, (2020,8,6), (2020,8,9)).groupby(['dist_risk', 'home']))

Unnamed: 0_level_0,Unnamed: 1_level_0,count
dist_risk,home,Unnamed: 2_level_1
high,"Kutcha (made of mud, tin, straw)",0.395522
high,Pucca (have cement/brick wall and floor,0.297101
high,Semi-pucca,0.415842
low,"Kutcha (made of mud, tin, straw)",0.125
low,Pucca (have cement/brick wall and floor,0.040541
low,Semi-pucca,0.155556
med,"Kutcha (made of mud, tin, straw)",0.104294
med,Pucca (have cement/brick wall and floor,0.163934
med,Semi-pucca,0.21875


In [246]:
malaria_prob(filter_time(dd, (2020,7,8), (2020,8,1)).groupby(['dist_risk']))

Unnamed: 0_level_0,count
dist_risk,Unnamed: 1_level_1
high,0.354808
low,0.107505
med,0.195302


In [250]:
malaria_prob(filter_time(dd, (2020,8,6), (2020,8,9)).groupby(['dist_risk']))

Unnamed: 0_level_0,count
dist_risk,Unnamed: 1_level_1
high,0.364611
low,0.09434
med,0.16242


In [None]:
filter_time(dd, (2020,7,6), (2020,8,1)).malaria.value_counts(normalize=True)

In [238]:
filter_time(dd, (2020,7,6), (2020,8,1)).groupby('dist_risk').home.value_counts(normalize=True)

dist_risk  home                                   
high       Pucca (have cement/brick wall and floor    0.522115
           Kutcha (made of mud, tin, straw)           0.239423
           Semi-pucca                                 0.238462
low        Pucca (have cement/brick wall and floor    0.629817
           Semi-pucca                                 0.240365
           Kutcha (made of mud, tin, straw)           0.129817
med        Pucca (have cement/brick wall and floor    0.560403
           Semi-pucca                                 0.248322
           Kutcha (made of mud, tin, straw)           0.191275
Name: home, dtype: float64

In [241]:
filter_time(dd, (2020,8,7), (2020,8,9)).groupby('dist_risk').home.value_counts(normalize=True)

dist_risk  home                                   
high       Pucca (have cement/brick wall and floor    0.363914
           Kutcha (made of mud, tin, straw)           0.342508
           Semi-pucca                                 0.293578
low        Pucca (have cement/brick wall and floor    0.443182
           Kutcha (made of mud, tin, straw)           0.295455
           Semi-pucca                                 0.261364
med        Pucca (have cement/brick wall and floor    0.493827
           Semi-pucca                                 0.253968
           Kutcha (made of mud, tin, straw)           0.252205
Name: home, dtype: float64

In [243]:
filter_time(dd, (2020,7,6), (2020,8,1)).malaria.value_counts(normalize=True)

False    0.806308
True     0.193692
Name: malaria, dtype: float64

In [245]:
filter_time(dd, (2020,8,6), (2020,8,9)).malaria.value_counts(normalize=True)

False    0.781897
True     0.218103
Name: malaria, dtype: float64

In [198]:
filter_time(dd, (2020,7,6), (2020,8,1)).groupby('dist_risk').home.value_counts(normalize=True)

dist_risk  home                                   
high       Pucca (have cement/brick wall and floor    0.522115
           Kutcha (made of mud, tin, straw)           0.239423
           Semi-pucca                                 0.238462
low        Pucca (have cement/brick wall and floor    0.625341
           Semi-pucca                                 0.238667
           Kutcha (made of mud, tin, straw)           0.135991
med        Pucca (have cement/brick wall and floor    0.571429
           Semi-pucca                                 0.249540
           Kutcha (made of mud, tin, straw)           0.179031
Name: home, dtype: float64

In [196]:
filter_time(dd, (2020,8,6), (2020,8,9)).groupby('dist_risk').home.value_counts(normalize=True)

dist_risk  home                                   
high       Pucca (have cement/brick wall and floor    0.369973
           Kutcha (made of mud, tin, straw)           0.359249
           Semi-pucca                                 0.270777
low        Pucca (have cement/brick wall and floor    0.475410
           Semi-pucca                                 0.268852
           Kutcha (made of mud, tin, straw)           0.255738
med        Pucca (have cement/brick wall and floor    0.485477
           Kutcha (made of mud, tin, straw)           0.259336
           Semi-pucca                                 0.255187
Name: home, dtype: float64

In [None]:
filter_time(dd, (2020,8,1), (2020,9,1)).pipe(lambda df: df[df.dist_risk == 'low'])

In [399]:
malaria_prob(dd.groupby(['dist_risk', 'under_net']))

KeyError: 'dist_risk'

In [398]:
malaria_prob(dd.groupby(['under_net']))

Unnamed: 0_level_0,count
under_net,Unnamed: 1_level_1
False,0.190691
True,0.208074


In [403]:
malaria_prob(dd.groupby(['dist_risk', 'home']))

Unnamed: 0_level_0,Unnamed: 1_level_0,count
dist_risk,home,Unnamed: 2_level_1
high,"Kutcha (made of mud, tin, straw)",0.404711
high,Pucca (have cement/brick wall and floor,0.308943
high,Semi-pucca,0.363636
low,"Kutcha (made of mud, tin, straw)",0.129747
low,Pucca (have cement/brick wall and floor,0.086731
low,Semi-pucca,0.140481
med,"Kutcha (made of mud, tin, straw)",0.164969
med,Pucca (have cement/brick wall and floor,0.17199
med,Semi-pucca,0.202055


In [None]:
malaria_prob(dd.groupby(['owns_net']))

In [409]:
dd.owns_net.value_counts()

Yes           4597
No            1580
Don't know      71
Name: owns_net, dtype: int64

In [418]:
dd.groupby(['has_ac']).owns_net.value_counts()

has_ac      owns_net  
Don't know  Yes            265
            No              71
            Don't know      32
No          Yes           3903
            No            1288
            Don't know      31
Yes         Yes            429
            No             221
            Don't know       8
Name: owns_net, dtype: int64

In [431]:
dd.groupby(['dist_risk']).home.value_counts(1)

dist_risk  home                                   
high       Pucca (have cement/brick wall and floor    0.493127
           Kutcha (made of mud, tin, straw)           0.267468
           Semi-pucca                                 0.239404
low        Pucca (have cement/brick wall and floor    0.611514
           Semi-pucca                                 0.245240
           Kutcha (made of mud, tin, straw)           0.143246
med        Pucca (have cement/brick wall and floor    0.531794
           Semi-pucca                                 0.254355
           Kutcha (made of mud, tin, straw)           0.213850
Name: home, dtype: float64

In [430]:
dd.groupby(['dist_risk', 'malaria']).under_net.value_counts(1)

dist_risk  malaria  under_net
high       False    True         0.561896
                    False        0.438104
           True     True         0.584843
                    False        0.415157
low        False    True         0.508621
                    False        0.491379
           True     True         0.508547
                    False        0.491453
med        False    True         0.543190
                    False        0.456810
           True     True         0.555012
                    False        0.444988
Name: under_net, dtype: float64

In [405]:
malaria_prob(dd.groupby(['dist_risk', 'under_net']))

Unnamed: 0_level_0,Unnamed: 1_level_0,count
dist_risk,under_net,Unnamed: 2_level_1
high,False,0.335553
high,True,0.356784
low,False,0.106089
low,True,0.106061
med,False,0.17433
med,True,0.18131


In [428]:
dd.groupby(['home', 'has_ac']).under_net.value_counts(1)

home                                     has_ac      under_net
Kutcha (made of mud, tin, straw)         Don't know  True         0.553957
                                                     False        0.446043
                                         No          True         0.593315
                                                     False        0.406685
                                         Yes         True         0.655172
                                                     False        0.344828
Pucca (have cement/brick wall and floor  Don't know  True         0.538462
                                                     False        0.461538
                                         No          True         0.519481
                                                     False        0.480519
                                         Yes         False        0.622093
                                                     True         0.377907
Semi-pucca                           

In [426]:
dd.groupby(['home']).owns_net.value_counts(1)

home                                     owns_net  
Kutcha (made of mud, tin, straw)         Yes           0.737049
                                         No            0.247253
                                         Don't know    0.015699
Pucca (have cement/brick wall and floor  Yes           0.718449
                                         No            0.270184
                                         Don't know    0.011367
Semi-pucca                               Yes           0.773169
                                         No            0.219054
                                         Don't know    0.007777
Name: owns_net, dtype: float64

In [422]:
malaria_prob(dd.groupby(['home', 'dist_risk', 'under_net']))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
home,dist_risk,under_net,Unnamed: 3_level_1
"Kutcha (made of mud, tin, straw)",high,False,0.369565
"Kutcha (made of mud, tin, straw)",high,True,0.427562
"Kutcha (made of mud, tin, straw)",low,False,0.103704
"Kutcha (made of mud, tin, straw)",low,True,0.149171
"Kutcha (made of mud, tin, straw)",med,False,0.154229
"Kutcha (made of mud, tin, straw)",med,True,0.172414
Pucca (have cement/brick wall and floor,high,False,0.300771
Pucca (have cement/brick wall and floor,high,True,0.315678
Pucca (have cement/brick wall and floor,low,False,0.100407
Pucca (have cement/brick wall and floor,low,True,0.070261


In [423]:
malaria_prob(dd.groupby(['state', 'dist_risk', 'under_net']))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
state,dist_risk,under_net,Unnamed: 3_level_1
Chhatisgarh,high,False,0.431373
Chhatisgarh,high,True,0.283186
Chhatisgarh,low,False,0.1
Chhatisgarh,low,True,0.117647
Chhatisgarh,med,False,0.127273
Chhatisgarh,med,True,0.19209
Jharkhand,high,False,0.425743
Jharkhand,high,True,0.395639
Jharkhand,low,False,0.092593
Jharkhand,low,True,0.124138


In [None]:
# Feature matrix for the classifier: the categorical survey answers encoded
# as integer category codes.
# `.copy()` makes X an independent frame; assigning columns on a bare
# selection of `dd` raises pandas' SettingWithCopyWarning.
X = dd[['education', 'home', 'dist_medical', 'has_ac', 'owns_net']].copy()

for col in X.columns:
    # Codes follow the sorted order of each column's distinct values.
    X[col] = X[col].astype('category').cat.codes

In [None]:
X

In [423]:
X.shape

(4474, 5)

In [425]:
y = dd['malaria']

In [None]:
y.shape

In [542]:
# NOTE(review): CategoricalNB is imported but not used anywhere in the
# visible notebook.
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): no random_state is set, so repeated fits are not guaranteed
# to produce the same tree — consider fixing a seed.
model = DecisionTreeClassifier()

# Fit on the first 2000 rows of X / y only; no evaluation on the remaining
# rows is visible in this notebook.
model.fit(X[:2000], y[:2000])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:

from sklearn.tree import plot_tree

plot_tree(model)

In [300]:
dd[(dd.owns_net == 'Yes') & 
   (dd.has_ac == 'No') & 
   (dd.home == 'Kutcha (made of mud, tin, straw)')] \
   .malaria_incidence.value_counts(normalize=True)

No            0.636550
Yes           0.268994
Don’t know    0.094456
Name: malaria_incidence, dtype: float64

In [None]:
dd

In [330]:
def times(df, mi, ma):
    """Return the rows of ``df`` whose ``timestamp`` is strictly between ``mi`` and ``ma``.

    This filters individual rows; both bounds are exclusive.
    """
    after_start = df.timestamp > mi
    before_end = df.timestamp < ma
    return df[after_start & before_end]


times(dd, datetime(2020,7,22,tzinfo=timezone.utc), datetime(2020,7,24,tzinfo=timezone.utc)) \
    ['has_ac'] \
    .value_counts(normalize=True)

No            0.846106
Yes           0.113693
Don't know    0.040201
Name: has_ac, dtype: float64

In [331]:
times(dd, datetime(2020,7,24,tzinfo=timezone.utc), datetime(2020,7,25,tzinfo=timezone.utc)) \
    ['has_ac'] \
    .value_counts(normalize=True)

No            0.797244
Yes           0.147638
Don't know    0.055118
Name: has_ac, dtype: float64

In [332]:
times(dd, datetime(2020,7,25,tzinfo=timezone.utc), datetime(2020,7,26,tzinfo=timezone.utc)) \
    ['has_ac'] \
    .value_counts(normalize=True)

No            0.862589
Yes           0.090426
Don't know    0.046986
Name: has_ac, dtype: float64

In [328]:
times(dd, datetime(2020,7,26,tzinfo=timezone.utc), datetime(2020,7,28,tzinfo=timezone.utc)) \
    ['has_ac'] \
    .value_counts(normalize=True)

No            0.825040
Yes           0.126806
Don't know    0.048154
Name: has_ac, dtype: float64

In [None]:
dd

In [301]:
dd.malaria_incidence.value_counts(normalize=True)

No            0.670988
Yes           0.191775
Don’t know    0.137237
Name: malaria_incidence, dtype: float64

In [None]:
# responses should be converted and translated...