In [1]:
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
wefarm = pd.read_parquet("/content/drive/MyDrive/DataKit_WeFarm/wefarm.parquet")

In [4]:
wefarm["response_sent_dt"] = pd.to_datetime(wefarm["response_sent"], format="ISO8601")
wefarm["question_set_dt"] = pd.to_datetime(wefarm["question_sent"], format="ISO8601")

In [5]:

def parse_datetime_col(df, colstr):
  df[f'{colstr}_year'] = df[colstr].dt.year
  df[f'{colstr}_month'] = df[colstr].dt.month
  df[f'{colstr}_day'] = df[colstr].dt.day
  df[f'{colstr}_hour'] = df[colstr].dt.hour
  df[f'{colstr}_minute'] = df[colstr].dt.minute
  df[f'{colstr}_second'] = df[colstr].dt.second
  df[f'{colstr}_day_name'] = df[colstr].dt.day_name()
  df[f'{colstr}_day_of_week'] = df[colstr].dt.day_of_week
  df[f'{colstr}_month_name'] = df[colstr].dt.month_name()
  df[f'{colstr}_quarter'] = df[colstr].dt.quarter
  df[f'{colstr}_week'] = df[colstr].dt.isocalendar().week
  df[f'{colstr}_date'] = df[colstr].dt.date
  df[f'{colstr}_time'] = df[colstr].dt.time

  return df

In [6]:
wefarm = parse_datetime_col(wefarm, "response_sent_dt")
wefarm = parse_datetime_col(wefarm, "question_set_dt")

In [7]:
niche = pd.read_parquet("/content/drive/MyDrive/DataKit_WeFarm/en_questions_cat_niche.parquet")
broad = pd.read_parquet("/content/drive/MyDrive/DataKit_WeFarm/en_questions_cat_broad.parquet")

In [8]:
all = pd.merge(wefarm, niche, on="question_id").merge(
  broad, on="question_id"
)

In [9]:
import json

with open("/content/drive/MyDrive/DataKit_WeFarm/segment_user_ids.json", "r") as r:
    segment_user_id_dct = json.load(r)

In [10]:
segment_user_id_dct.keys()

dict_keys(['user_activity_post_count', 'speed_post_response', 'unique_askers', 'tenure'])

In [11]:
# segment_user_id_dct["user_activity_post_count"].keys()

In [12]:
from collections import defaultdict

user_segments = defaultdict(dict)
for agg, dct in segment_user_id_dct.items():
  for segment, user_ids in dct.items():
    for user_id in user_ids:
      user_segments[user_id][agg] = segment

row_lst = []
for user_id, segments in user_segments.items():
  row = {"user_id": user_id}
  row.update(segments)
  row_lst.append(row)

user_segments_df = pd.DataFrame(row_lst)

In [13]:
all1 = all.merge(user_segments_df, left_on="question_user_id", right_on="user_id", how="inner")

In [14]:
all2 = all1[all1["question_user_country_code"].isin(["ug", "ke"])]

In [15]:
all3 = all2.drop_duplicates("question_id")

In [16]:
all3["question_user_country_code"] = [j.upper() for j in all3["question_user_country_code"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all3["question_user_country_code"] = [j.upper() for j in all3["question_user_country_code"]]


# define topics lists

In [17]:
all3.iloc[:5, 54:173]

Unnamed: 0,financial-inclusion_price,financial-inclusion_sell,financial-inclusion_buy,diversification_obtain,diversification_acquire,question_what,question_when,question_how,question_which,crop_harvest,climate_time,disease_harvest,livestock_livestock,livestock_animals,livestock_sheep,livestock_cattle,livestock_donkey,livestock_pig,livestock_cow,livestock_goat,livestock_camel,livestock_hen,livestock_chicken,livestock_poultry,livestock_bee,diversification_expand,diversification_begin,crop_plant,crop_variety,disease_plant,crop_seed,crop_grow,crop_mulch,diversification_grow,crop_crop,crop_potatoes,crop_passion,crop_maize,crop_banana,crop_coffee,crop_tea,crop_onions,crop_bean,crop_cabbage,climate_rain,diversification_add,question_who,disease_rabbit,diversification_use,crop_tomatoes,crop_rice,disease_disease,disease_care,crop_keep,disease_keep,financial-inclusion_market,financial-inclusion_cost,question_many,crop_layer,disease_treat,disease_prevent,disease_chemical,crop_fruit,livestock_milk,crop_leave,crop_land,disease_leave,livestock_egg,livestock_breed,crop_manure,diversification_increase,climate_season,disease_control,climate_sun,crop_yield,livestock_dairy,crop_soil,diversification_irrigation,diversification_scale,financial-inclusion_money,question_where,diversification_best,disease_spray,disease_turn,disease_attack,financial-inclusion_fee,crop_type,climate_climate,disease_medicine,disease_tick,disease_weed,disease_pests,financial-inclusion_finance,financial-inclusion_investment,financial-inclusion_loan,diversification_new,livestock_lay,climate_heat,climate_drought,diversification_clear,disease_black,disease_space,crop_fertilizer,disease_sick,climate_temperature,climate_flood,disease_affect,financial-inclusion_sum,disease_unhealthy,financial-inclusion_bank,financial-inclusion_insurance,disease_mean,financial-inclusion_credit,diversification_acreage,diversification_diversify,diversification_register,climate_water,diversification_shift,financial-inclusion_economy
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,1.0,,,,1.0,,1.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1.0,1.0,,,,1.0,1.0,1.0,1.0,,1.0,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,,,,,,1.0,1.0,,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1.0,1.0,1.0,1.0,,1.0,,,1.0,,,,,,,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [18]:
niche_collst = all3.columns[54:173]

In [19]:
all3.iloc[:5, -12:-5]

Unnamed: 0,financial-inclusion,diversification,question,crop,climate,disease,livestock
0,1.0,1.0,1.0,,,,
1,,,1.0,1.0,1.0,1.0,1.0
2,,1.0,1.0,1.0,,1.0,
4,1.0,1.0,1.0,1.0,1.0,1.0,
8,,1.0,1.0,1.0,,1.0,1.0


In [20]:
broad_collst = all3.columns[-12:-5]

#colors

In [21]:
import plotly.express as px

segment1_color = "#A1A1A1"
segment2_color = "#F9C926"
segment3_color = "#DC7326"
segment4_color = "#4897CD"
segment5_color = "#2E9999"

In [22]:
segment_colors_dct = {
  "1": segment1_color,
  "2": segment2_color,
  "3": segment3_color,
  "4": segment4_color,
  "5": segment5_color
}

# map and pie

In [23]:
vc_country = all3["question_user_country_code"].value_counts()

# segments distribution tab

In [24]:
segmentation_collst = list(segment_user_id_dct.keys())

gb_segments_dct = {}
for segmentation in segmentation_collst:
  gb_segments_dct[segmentation] = all3.groupby([
    "question_user_country_code",
    segmentation
  ]).size().reset_index(name="n")

# topic sums

In [25]:
import numpy as np

gb_country_segment_b_topic_dct = {}
for segmentation in segmentation_collst:
  gb_sum = all3.groupby([
    "question_user_country_code",
    segmentation
  ])[broad_collst].sum()
  gb_count = all3.groupby([
    "question_user_country_code",
    segmentation
  ]).size()
  row_sums = gb_sum[broad_collst].sum(axis=1)
  gb_pct = gb_sum[broad_collst].div(row_sums, axis=0)
  gb_se = np.sqrt(gb_pct * ( 1- gb_pct)).div(np.sqrt(gb_count), axis=0)

  gb_pct = gb_pct.reset_index()
  gb_se = gb_se.reset_index()
  gb_sum = gb_sum.reset_index()

  gb_sum["count"] = gb_count.values

  gb_pct_long = gb_pct.melt(
    id_vars=["question_user_country_code", segmentation],
    value_vars=broad_collst,
    var_name="category",
    value_name="pct"
  )

  gb_se_long = gb_se.melt(
    id_vars=["question_user_country_code", segmentation],
    value_vars=broad_collst,
    var_name="category",
    value_name="se"
  )

  gb_long = gb_pct_long.merge(
    gb_se_long,
    on=["question_user_country_code", segmentation, "category"]
  ).merge(
    gb_sum[["question_user_country_code", segmentation, "count"]],
    on=["question_user_country_code", segmentation]
  ).sort_values(
    by="category"
  )

  gb_long["segment_color"] = gb_long[segmentation].map(segment_colors_dct)

  gb_country_segment_b_topic_dct[segmentation] = gb_long


In [26]:
gb_country_segment_b_topic_dct["user_activity_post_count"]

Unnamed: 0,question_user_country_code,user_activity_post_count,category,pct,se,count,segment_color
41,KE,2,climate,0.056527,0.000637,131321,#F9C926
48,UG,4,climate,0.048071,0.000384,310612,#4897CD
47,UG,3,climate,0.048781,0.000457,222005,#DC7326
46,UG,2,climate,0.047917,0.00058,135611,#F9C926
40,KE,1,climate,0.054767,0.000457,247890,#A1A1A1
45,UG,1,climate,0.046726,0.000434,236296,#A1A1A1
44,KE,5,climate,0.057686,0.000336,482648,#2E9999
43,KE,4,climate,0.058065,0.00038,377920,#4897CD
42,KE,3,climate,0.057804,0.000495,221901,#DC7326
49,UG,5,climate,0.045607,0.000426,239638,#2E9999


In [27]:
gb_country_segment_n_topic_dct = {}
for segmentation in segmentation_collst:
  gb_sum = all3.groupby([
    "question_user_country_code",
    segmentation
  ])[niche_collst].sum()
  gb_count = all3.groupby([
    "question_user_country_code",
    segmentation
  ]).size()

  row_sums = gb_sum[niche_collst].sum(axis=1)
  gb_pct = gb_sum[niche_collst].div(row_sums, axis=0)
  gb_se = np.sqrt(gb_pct * (1 - gb_pct)).div(np.sqrt(gb_count), axis=0)

  gb_pct = gb_pct.reset_index()
  gb_se = gb_se.reset_index()
  gb_sum = gb_sum.reset_index()

  gb_sum['count'] = gb_count.values

  gb_pct_long = gb_pct.melt(
    id_vars=["question_user_country_code", segmentation],
    value_vars=niche_collst,
    var_name="category",
    value_name="pct"
  )

  gb_se_long = gb_se.melt(
    id_vars=["question_user_country_code", segmentation],
    value_vars=niche_collst,
    var_name="category",
    value_name="se"
  )

  gb_long = gb_pct_long.merge(
    gb_se_long,
    on=["question_user_country_code", segmentation, "category"]
  ).merge(
    gb_sum[["question_user_country_code", segmentation, "count"]],
    on=["question_user_country_code", segmentation]
  )

  gb_long["broad_type"] = gb_long["category"].str.split("_").str[0]
  gb_long["niche"] = gb_long["category"].str.split("_").str[1]

  gb_long = gb_long.sort_values(
      by=["broad_type", "pct", "niche"],
      ascending=[True, False, True]
  )

  gb_long["segment_color"] = gb_long[segmentation].map(segment_colors_dct)

  gb_country_segment_n_topic_dct[segmentation] = gb_long

In [28]:
gb_country_segment_n_topic_dct["user_activity_post_count"]

Unnamed: 0,question_user_country_code,user_activity_post_count,category,pct,se,count,broad_type,niche,segment_color
102,KE,3,climate_time,0.008287,0.000192,221901,climate,time,#DC7326
101,KE,2,climate_time,0.008126,0.000248,131321,climate,time,#F9C926
103,KE,4,climate_time,0.008113,0.000146,377920,climate,time,#4897CD
100,KE,1,climate_time,0.007991,0.000179,247890,climate,time,#A1A1A1
107,UG,3,climate_time,0.007874,0.000188,222005,climate,time,#DC7326
106,UG,2,climate_time,0.007766,0.000238,135611,climate,time,#F9C926
104,KE,5,climate_time,0.007751,0.000126,482648,climate,time,#2E9999
105,UG,1,climate_time,0.007615,0.000179,236296,climate,time,#A1A1A1
108,UG,4,climate_time,0.007533,0.000155,310612,climate,time,#4897CD
109,UG,5,climate_time,0.006766,0.000167,239638,climate,time,#2E9999


In [29]:
# gb_country_segment_n_topic_dct = {}
# for segmentation in segmentation_collst:
#   gb = all3.groupby([
#     "question_user_country_code",
#     segmentation
#   ])[niche_collst].sum().reset_index()

#   row_sums = gb[niche_collst].sum(axis=1)
#   gb[niche_collst] = gb[niche_collst].div(row_sums, axis=0)

#   gb_long = gb.melt(
#     id_vars=["question_user_country_code", segmentation],
#     value_vars=gb.columns[2:],
#     var_name="category",
#     value_name="pct"
#   )

#   gb_long["broad_type"] = gb_long["category"].str.split("_").str[0]
#   gb_long["niche"] = gb_long["category"].str.split("_").str[1]

#   gb_long = gb_long.sort_values(
#     by=["broad_type", "pct", "niche"],
#     ascending=[True, False, True]
#   )

#   gb_long["segment_color"] = gb_long[segmentation].map(segment_colors_dct)

#   gb_country_segment_n_topic_dct[segmentation] = gb_long


#time slices

In [30]:
time_collst = [
  "question_set_dt_year",
  "question_set_dt_month",
  "question_set_dt_hour",
  "question_set_dt_day_of_week"
]

In [40]:
gb_country_segment_b_topic_time_dct = defaultdict(dict)
for segmentation in segmentation_collst:
  for time in time_collst:
    gb_sum = all3.groupby([
      "question_user_country_code",
      segmentation,
      time
    ])[broad_collst].sum()

    gb_count = all3.groupby([
      "question_user_country_code",
      segmentation,
      time
    ]).size()

    row_sums = gb_sum[broad_collst].sum(axis=1)
    gb_pct = gb_sum[broad_collst].div(row_sums, axis=0)
    gb_se = np.sqrt(gb_pct * (1 - gb_pct)).div(np.sqrt(gb_count), axis=0)

    gb_pct = gb_pct.reset_index()
    gb_se = gb_se.reset_index()
    gb_sum = gb_sum.reset_index()
    gb_sum['count'] = gb_count.values

    gb_pct_long = gb_pct.melt(
      id_vars=["question_user_country_code", segmentation, time],
      value_vars=broad_collst,
      var_name="category",
      value_name="pct"
    )

    gb_se_long = gb_se.melt(
      id_vars=["question_user_country_code", segmentation, time],
      value_vars=broad_collst,
      var_name="category",
      value_name="se"
    )

    gb_long = gb_pct_long.merge(
      gb_se_long,
      on=["question_user_country_code", segmentation, time, "category"]
    ).merge(
      gb_sum[["question_user_country_code", segmentation, time, "count"]],
      on=["question_user_country_code", segmentation, time]
    ).sort_values(
      by=time,
      ascending=False
    )

    gb_long["segment_color"] = gb_long[segmentation].map(segment_colors_dct)

    gb_country_segment_b_topic_time_dct[segmentation][time] = gb_long

In [32]:
# gb_country_segment_b_topic_time_dct = defaultdict(dict)
# for segmentation in segmentation_collst:
#   for time in time_collst:
#     gb = all3.groupby([
#       "question_user_country_code",
#       segmentation,
#       time
#     ])[broad_collst].sum().reset_index()

#     row_sums = gb[broad_collst].sum(axis=1)
#     gb[broad_collst ] = gb[broad_collst].div(row_sums, axis=0)

#     gb_long = gb.melt(
#       id_vars=["question_user_country_code", segmentation, time],
#       value_vars=gb.columns[2:],
#       var_name="category",
#       value_name="pct"
#     )

#     gb_long["segment_color"] = gb_long[segmentation].map(segment_colors_dct)

#     gb_country_segment_b_topic_time_dct[segmentation][time] = gb_long

In [33]:
gb_country_segment_b_topic_time_dct["speed_post_response"]["question_set_dt_year"]

Unnamed: 0,question_user_country_code,speed_post_response,question_set_dt_year,category,pct,se,count,segment_color
0,KE,1,2017,financial-inclusion,0.102541,0.005497,3046,#A1A1A1
35,UG,2,2017,financial-inclusion,0.09062,0.006287,2085,#F9C926
45,UG,4,2017,financial-inclusion,0.107625,0.010856,815,#4897CD
40,UG,3,2017,financial-inclusion,0.095777,0.006309,2176,#DC7326
55,KE,1,2017,diversification,0.125742,0.006008,3046,#A1A1A1
50,UG,5,2017,financial-inclusion,0.105479,0.012287,625,#2E9999
325,UG,5,2017,disease,0.183562,0.015485,625,#2E9999
61,KE,2,2017,diversification,0.131825,0.006185,2992,#F9C926
330,KE,1,2017,livestock,0.128575,0.006065,3046,#A1A1A1
336,KE,2,2017,livestock,0.129903,0.006146,2992,#F9C926


In [44]:
gb_country_segment_n_topic_time_dct = defaultdict(dict)
for segmentation in segmentation_collst:
  for time in time_collst:
    gb_sum = all3.groupby([
      "question_user_country_code",
      segmentation,
      time
    ])[niche_collst].sum()

    gb_count = all3.groupby([
      "question_user_country_code",
      segmentation,
      time
    ]).size()

    row_sums = gb_sum[niche_collst].sum(axis=1)
    gb_pct = gb_sum[niche_collst].div(row_sums, axis=0)
    gb_se = np.sqrt(gb_pct * (1 - gb_pct)).div(np.sqrt(gb_count), axis=0)

    gb_pct = gb_pct.reset_index()
    gb_se = gb_se.reset_index()
    gb_sum = gb_sum.reset_index()
    gb_sum['count'] = gb_count.values

    gb_pct_long = gb_pct.melt(
      id_vars=["question_user_country_code", segmentation, time],
      value_vars=niche_collst,
      var_name="category",
      value_name="pct"
    )

    gb_se_long = gb_se.melt(
      id_vars=["question_user_country_code", segmentation, time],
      value_vars=niche_collst,
      var_name="category",
      value_name="se"
    )

    gb_long = gb_pct_long.merge(
      gb_se_long,
      on=["question_user_country_code", segmentation, time, "category"]
    ).merge(
      gb_sum[["question_user_country_code", segmentation, time, "count"]],
      on=["question_user_country_code", segmentation, time]
    ).sort_values(
      by=time,
      ascending=True
    )

    gb_long["broad_type"] = gb_long["category"].str.split("_").str[0]
    gb_long["niche"] = gb_long["category"].str.split("_").str[1]

    gb_long["segment_color"] = gb_long[segmentation].map(segment_colors_dct)

    gb_country_segment_n_topic_time_dct[segmentation][time] = gb_long

In [45]:
gb_country_segment_n_topic_time_dct["speed_post_response"]["question_set_dt_year"]

Unnamed: 0,question_user_country_code,speed_post_response,question_set_dt_year,category,pct,se,count,broad_type,niche,segment_color
6485,UG,5,2017,diversification_shift,0.0,0.0,625,diversification,shift,#2E9999
6490,KE,1,2017,financial-inclusion_economy,2.7e-05,9.4e-05,3046,financial-inclusion,economy,#A1A1A1
6496,KE,2,2017,financial-inclusion_economy,0.000205,0.000262,2992,financial-inclusion,economy,#F9C926
3575,KE,1,2017,crop_land,0.007453,0.001558,3046,crop,land,#A1A1A1
3581,KE,2,2017,crop_land,0.007733,0.001601,2992,crop,land,#F9C926
3587,KE,3,2017,crop_land,0.008042,0.00138,4187,crop,land,#DC7326
12,KE,3,2017,financial-inclusion_price,0.005752,0.001169,4187,financial-inclusion,price,#DC7326
18,KE,4,2017,financial-inclusion_price,0.00685,0.002208,1395,financial-inclusion,price,#4897CD
5490,UG,4,2017,diversification_clear,0.001508,0.001359,815,diversification,clear,#4897CD
45,UG,4,2017,financial-inclusion_price,0.007138,0.002949,815,financial-inclusion,price,#4897CD


In [36]:
# gb_country_segment_n_topic_time_dct = defaultdict(dict)
# for segmentation in segmentation_collst:
#   for time in time_collst:
#     gb = all3.groupby([
#       "question_user_country_code",
#       segmentation,
#       time
#     ])[niche_collst].sum().reset_index()

#     row_sums = gb[niche_collst].sum(axis=1)
#     gb[niche_collst] = gb[niche_collst].div(row_sums, axis=0)

#     gb_long = gb.melt(
#       id_vars=["question_user_country_code", segmentation, time],
#       value_vars=gb.columns[2:],
#       var_name="category",
#       value_name="pct"
#     )

#     gb_long["broad_type"] = gb_long["category"].str.split("_").str[0]
#     gb_long["niche"] = gb_long["category"].str.split("_").str[1]

#     gb_long["segment_color"] = gb_long[segmentation].map(segment_colors_dct)

#     gb_country_segment_n_topic_time_dct[segmentation][time] = gb_long

# export data to google drive

In [37]:
alldata_dct = {
  "tab1": vc_country.to_dict(),
  "tab2": gb_segments_dct,
  "tab3": {
    "broad": gb_country_segment_b_topic_dct,
    "niche": gb_country_segment_n_topic_dct
  },
  "tab4": {
    "broad": gb_country_segment_b_topic_time_dct,
    "niche": gb_country_segment_n_topic_time_dct
  }
}

In [38]:
import pickle

with open("all_dashboard_data.pkl", "wb") as w:
  pickle.dump(alldata_dct, w)
!cp all_dashboard_data.pkl /content/drive/MyDrive/DataKit_WeFarm/Dashboard

In [39]:
!ls /content/drive/MyDrive/DataKit_WeFarm/Dashboard

all_dashboard_data.pkl	intersections
