In [24]:
import pandas as pd

# Notebook display settings: show every column, cap printed rows at 50.
pd.set_option(
  "display.max_columns", None,
  "display.max_rows", 50,
)

In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive (Colab-only); the parquet files read
# below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the WeFarm table from the mounted Drive folder; it is joined to the
# category tables on "question_id" further down.
wefarm = pd.read_parquet("/content/drive/MyDrive/DataKit_WeFarm/wefarm.parquet")

In [4]:
# Load the two question-category tables (niche and broad, per the file names).
# Both are merged onto `wefarm` by "question_id" in the next cell.
# NOTE(review): the "en_" prefix presumably means English-language questions only - confirm.
niche_questions_categories_df = pd.read_parquet("/content/drive/MyDrive/DataKit_WeFarm/en_questions_cat_niche.parquet")
broad_questions_categories_df = pd.read_parquet("/content/drive/MyDrive/DataKit_WeFarm/en_questions_cat_broad.parquet")

In [120]:
# Join the WeFarm table with both category tables on "question_id".
# The default join is inner, so rows without a match in either category
# table are dropped.
# NOTE(review): the name `all` shadows the Python builtin for the rest of the
# notebook; renaming it means touching every later cell that uses it.
all = (
  wefarm
  .merge(niche_questions_categories_df, on="question_id")
  .merge(broad_questions_categories_df, on="question_id")
)

In [121]:
# Parse the ISO-8601 formatted send times into datetime columns.
# NOTE(review): the parsed question column is spelled "question_set_dt" although
# its source column is "question_sent". Later cells rely on the "question_set_dt"
# spelling, so renaming it requires updating those references as well.
all["response_sent_dt"] = pd.to_datetime(all["response_sent"], format="ISO8601")
all["question_set_dt"] = pd.to_datetime(all["question_sent"], format="ISO8601")

In [122]:
def parse_datetime_col(df, colstr):
  """Expand a datetime column into calendar/time component columns.

  For the datetime column ``colstr`` of ``df`` this adds, in order, the columns
  ``{colstr}_year``, ``_month``, ``_day``, ``_hour``, ``_minute``, ``_second``,
  ``_day_name``, ``_day_of_week``, ``_month_name``, ``_quarter``, ``_week``
  (ISO calendar week), ``_date`` and ``_time``.

  Args:
    df: DataFrame holding ``colstr``; it is modified in place.
    colstr: Name of a column that supports the ``.dt`` accessor.

  Returns:
    The same ``df`` object, with the new columns attached.
  """
  stamps = df[colstr].dt

  # Suffix -> derived values; dict order fixes the order of the new columns.
  components = {
    "year": stamps.year,
    "month": stamps.month,
    "day": stamps.day,
    "hour": stamps.hour,
    "minute": stamps.minute,
    "second": stamps.second,
    "day_name": stamps.day_name(),
    "day_of_week": stamps.day_of_week,
    "month_name": stamps.month_name(),
    "quarter": stamps.quarter,
    "week": stamps.isocalendar().week,
    "date": stamps.date,
    "time": stamps.time,
  }
  for suffix, values in components.items():
    df[f'{colstr}_{suffix}'] = values

  return df

In [123]:
# Attach calendar/time component columns for both parsed timestamps.
all = (
  all
  .pipe(parse_datetime_col, "response_sent_dt")
  .pipe(parse_datetime_col, "question_set_dt")
)

In [124]:
# Count response_id values per responding user, most active first.
gb = (
  all
  .groupby("response_user_id")[["response_id"]]
  .count()
  .sort_values(by="response_id", ascending=False)
)
# "Leaders" are the 20 users at the top of that ranking.
leaders_user_id_lst = gb.index[:20]


In [125]:
# Split rows by whether the responding user is one of the leaders.
# NOTE(review): the split is on response_user_id, so a question answered by
# both a leader and a non-leader presumably lands in both frames - confirm
# that this is intended.
leaders = all.loc[lambda rows: rows["response_user_id"].isin(leaders_user_id_lst)]
nonleaders = all.loc[lambda rows: ~rows["response_user_id"].isin(leaders_user_id_lst)]

In [126]:
# Keep one row per question (the first occurrence) within each group.
leaders1 = leaders.drop_duplicates(subset="question_id", keep="first")
nonleaders1 = nonleaders.drop_duplicates(subset="question_id", keep="first")

In [127]:
# Per-country subsets, keyed on the asking user's country code ("ke" / "ug").
leaders_ke = leaders1.loc[lambda rows: rows["question_user_country_code"] == "ke"]
nonleaders_ke = nonleaders1.loc[lambda rows: rows["question_user_country_code"] == "ke"]

leaders_ug = leaders1.loc[lambda rows: rows["question_user_country_code"] == "ug"]
nonleaders_ug = nonleaders1.loc[lambda rows: rows["question_user_country_code"] == "ug"]

In [128]:
def groupby_pct(df0, df1, time_colstr, agg_colstr):
  """Share of ``agg_colstr`` per ``time_colstr`` bucket, for two frames.

  Each frame is grouped by ``time_colstr``, ``agg_colstr`` is summed within each
  group, and every group sum is divided by that frame's grand total, so each
  output column sums to 1 (when the total is non-zero).

  Args:
    df0: Frame reported in the ``{agg_colstr}_leaders`` column.
    df1: Frame reported in the ``{agg_colstr}_nonleaders`` column.
    time_colstr: Column to group by.
    agg_colstr: Column to sum.

  Returns:
    DataFrame indexed by the union of group keys with the two share columns.
  """

  def _share_of_total(frame, out_name):
    # Per-group sums, renamed, then normalised by the sum over all groups.
    sums = frame.groupby(time_colstr).agg({agg_colstr: "sum"})
    sums = sums.rename(columns={agg_colstr: out_name})
    return sums / sums.sum()

  leaders_share = _share_of_total(df0, f"{agg_colstr}_leaders")
  nonleaders_share = _share_of_total(df1, f"{agg_colstr}_nonleaders")

  return pd.concat([leaders_share, nonleaders_share], axis=1)

In [151]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from itertools import chain

def time_scatters_broad(time_colstr):
  """Plot leader vs. non-leader shares of each broad category over time.

  Builds a grid with one row per broad category and two columns (Uganda on the
  left, Kenya on the right). Every panel holds two lines, leaders and
  non-leaders, giving the share of that category's total that falls into each
  ``time_colstr`` bucket, as computed by ``groupby_pct``.

  Reads the notebook-level frames ``leaders_ug``, ``nonleaders_ug``,
  ``leaders_ke`` and ``nonleaders_ke``.

  Args:
    time_colstr: Column to group by, e.g. ``"question_set_dt_month"``.

  Returns:
    The plotly figure, with all y-axes linked to each other.
  """

  cat_lst = ["livestock", "seasonality", "disease", "market", "crop"]
  color_lst = ["#74C476", "#F4D342"]  # [leaders, nonleaders]

  # (trace-name prefix, subplot column, leaders frame, non-leaders frame)
  panels = [
    ("ug", 1, leaders_ug, nonleaders_ug),
    ("ke", 2, leaders_ke, nonleaders_ke),
  ]

  bigfig = make_subplots(
    len(cat_lst), 2,  # row count follows the category list
    subplot_titles=list(chain.from_iterable([[j]*2 for j in cat_lst]))
  )

  for idx, cat in enumerate(cat_lst):
    for country, col, lead_df, nonlead_df in panels:
      shares = groupby_pct(lead_df, nonlead_df, time_colstr, cat)

      for group, color in zip(("leaders", "nonleaders"), color_lst):
        bigfig.add_trace(
          go.Scatter(
            x=shares.index,
            y=shares[f"{cat}_{group}"],
            mode="lines",
            marker={
              "color": color
            },
            name=f"{country}_{group}"
          ), row=idx+1, col=col
        )

  return bigfig.update_layout({
    "title": f"time slice: {time_colstr}",
    "height": 1200
  }).update_yaxes(matches="y")


In [152]:
# Broad-category shares grouped by the year of the question timestamp.
time_scatters_broad("question_set_dt_year")

In [153]:
# Broad-category shares grouped by the month of the question timestamp.
time_scatters_broad("question_set_dt_month")

In [154]:
# Broad-category shares grouped by day of week of the question timestamp.
time_scatters_broad("question_set_dt_day_of_week")

In [155]:
# Broad-category shares grouped by the hour of the question timestamp.
time_scatters_broad("question_set_dt_hour")

In [161]:
# Preview the first five rows of the niche-category columns (positional columns
# 26..93), the same slice time_scatters_niche iterates over. A row/column
# positional slice needs `.iloc`; `.columns` is a 1-D Index and cannot take a
# two-axis key.
leaders_ke.iloc[:5, 26:94]

Unnamed: 0,market_price,market_sell,market_buy,seasonality_time,livestock_animals,livestock_pig,livestock_cow,livestock_goat,livestock_hen,livestock_chicken,livestock_poultry,crop_layer,crop_plant,crop_variety,crop_harvest,disease_plant,disease_harvest,crop_seed,crop_grow,crop_mulch,crop_potatoes,crop_passion,crop_maize,crop_banana,crop_coffee,crop_onions,crop_bean,crop_cabbage,crop_land,disease_spray,disease_rabbit,crop_tomatoes,crop_rice,disease_disease,disease_care,crop_keep,disease_keep,market_market,market_cost,disease_treat,disease_prevent,disease_chemical,crop_fruit,livestock_milk,livestock_egg,livestock_breed,crop_fertilizer,crop_manure,seasonality_season,disease_control,crop_yield,livestock_dairy,crop_soil,disease_turn,disease_attack,market_fee,crop_type,crop_leave,disease_medicine,disease_leave,disease_tick,disease_weed,disease_pests,livestock_lay,disease_black,disease_space,disease_affect,disease_mean
2604,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4518,,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
27853,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
29373,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
44246,,,,,1.0,,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,


In [165]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from itertools import chain

def time_scatters_niche(time_colstr, cat_lst=None):
  """Plot leader vs. non-leader shares of each niche category over time.

  Builds a grid with one row per niche category and two columns (Uganda on the
  left, Kenya on the right). Every panel holds two lines, leaders and
  non-leaders, giving the share of that category's total that falls into each
  ``time_colstr`` bucket, as computed by ``groupby_pct``.

  Reads the notebook-level frames ``leaders_ug``, ``nonleaders_ug``,
  ``leaders_ke`` and ``nonleaders_ke``.

  Args:
    time_colstr: Column to group by, e.g. ``"question_set_dt_month"``.
    cat_lst: Optional iterable of category column names to plot. Defaults to
      ``leaders_ke.columns[26:94]``, the positional slice holding the niche
      category columns in the merged frame.

  Returns:
    The plotly figure, with all y-axes linked to each other. The default height
    of 1200 is small for the full category list; callers typically raise it
    with ``.update_layout``.

  Raises:
    ValueError: If ``cat_lst`` resolves to no categories.
  """

  if cat_lst is None:
    # NOTE(review): positional slice - it silently selects other columns if the
    # merged frame's column order changes. Selecting by name would be safer.
    cat_lst = leaders_ke.columns[26:94]
  cat_lst = list(cat_lst)
  if not cat_lst:
    raise ValueError("cat_lst must contain at least one category column")

  color_lst = ["#74C476", "#F4D342"]  # [leaders, nonleaders]

  # (trace-name prefix, subplot column, leaders frame, non-leaders frame)
  panels = [
    ("ug", 1, leaders_ug, nonleaders_ug),
    ("ke", 2, leaders_ke, nonleaders_ke),
  ]

  bigfig = make_subplots(
    len(cat_lst), 2,  # derived from the list instead of a hard-coded 68
    subplot_titles=list(chain.from_iterable([[j]*2 for j in cat_lst]))
  )

  for idx, cat in enumerate(cat_lst):
    for country, col, lead_df, nonlead_df in panels:
      shares = groupby_pct(lead_df, nonlead_df, time_colstr, cat)

      for group, color in zip(("leaders", "nonleaders"), color_lst):
        bigfig.add_trace(
          go.Scatter(
            x=shares.index,
            y=shares[f"{cat}_{group}"],
            mode="lines",
            marker={
              "color": color
            },
            name=f"{country}_{group}"
          ), row=idx+1, col=col
        )

  return bigfig.update_layout({
    "title": f"time slice: {time_colstr}",
    "height": 1200
  }).update_yaxes(matches="y")


In [170]:
# Uganda: specific months for specific crops.
# Uganda: also some upswing among leaders in "livestock_animals" and
# "livestock_dairy" questions towards the end of the year.

# One row per niche category, so use a much taller canvas than the default.
time_scatters_niche("question_set_dt_month").update_layout(height=7000)

In [171]:
# Uganda: certain livestock topics seem to be mentioned more by leaders in
# later years.

# One row per niche category, so use a much taller canvas than the default.
time_scatters_niche("question_set_dt_year").update_layout(height=7000)