# data import

In [1]:
import pandas as pd

In [2]:
wefarm = pd.read_csv(r"https://producersdirect-backups.s3.eu-west-1.amazonaws.com/athena-results/Unsaved/2025/09/26/b0cd514b-b9cc-4972-a0c2-c91726e6d825.csv")

# peek at the data

In [3]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 50)

wefarm.head(5)

Unnamed: 0,question_id,question_user_id,question_language,question_content,question_topic,question_sent,response_id,response_user_id,response_language,response_content,response_topic,response_sent,question_user_type,question_user_status,question_user_country_code,question_user_gender,question_user_dob,question_user_created_at,response_user_type,response_user_status,response_user_country_code,response_user_gender,response_user_dob,response_user_created_at
0,3849056,519124,nyn,E ABA WEFARM OFFICES ZABO NIZISHANGWA NKAHI?,,2017-11-22 12:25:03+00,20691011,200868,nyn,E!23 Omubazi Ni Dudu Cipa',,2019-01-24 17:54:06.216221+00,farmer,live,ug,,,2017-11-18 13:09:11+00,farmer,live,ug,,,2017-05-09 09:19:33+00
1,3849061,521327,eng,Q this goes to wefarm. is it possible to get f...,,2017-11-22 12:25:05+00,4334249,526113,eng,Q1 which stage is marleks last vaccinated,,2018-01-04 08:57:28+00,farmer,live,ug,,,2017-11-20 11:55:48+00,farmer,zombie,ug,,,2017-11-22 10:13:03+00
2,3849077,307821,nyn,E ENTE YANJE EZAIRE ENYENA YASHOBERA. \nOBWIRE...,cattle,2017-11-22 12:25:08+00,3849291,296187,nyn,Muhanguzi.Benon kuruga masha isingiro ente yaw...,tomato,2017-11-22 12:35:26+00,farmer,zombie,ug,,,2017-08-22 14:51:07+00,farmer,zombie,ug,,,2017-08-12 09:30:33+00
3,3849077,307821,nyn,E ENTE YANJE EZAIRE ENYENA YASHOBERA. \nOBWIRE...,cattle,2017-11-22 12:25:08+00,3849291,296187,nyn,Muhanguzi.Benon kuruga masha isingiro ente yaw...,cattle,2017-11-22 12:35:26+00,farmer,zombie,ug,,,2017-08-22 14:51:07+00,farmer,zombie,ug,,,2017-08-12 09:30:33+00
4,3849077,307821,nyn,E ENTE YANJE EZAIRE ENYENA YASHOBERA. \nOBWIRE...,cat,2017-11-22 12:25:08+00,3849291,296187,nyn,Muhanguzi.Benon kuruga masha isingiro ente yaw...,tomato,2017-11-22 12:35:26+00,farmer,zombie,ug,,,2017-08-22 14:51:07+00,farmer,zombie,ug,,,2017-08-12 09:30:33+00


In [4]:
wefarm.shape

(20304843, 24)

# install libraries

# create dataframes by language

In [5]:
wefarm_eng_df = wefarm[wefarm["question_language"] == "eng"][["question_id", "question_content"]].drop_duplicates()

In [6]:
doc_lst = wefarm_eng_df["question_content"]

# install preprocessing libraries libraries

In [7]:
!pip install gensim



In [8]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess
from tqdm import tqdm

#preprocessing

In [10]:
lem = WordNetLemmatizer()

def preprocess_text(text):
  tokens = simple_preprocess(text, deacc=True)
  return [lem.lemmatize(word, pos='v') for word in tokens]


In [11]:
processed_questions_eng_lst = []
for question in tqdm(doc_lst):
  processed_questions_eng_lst.append(preprocess_text(question))

100%|██████████| 2940542/2940542 [01:46<00:00, 27591.10it/s]


# keyword extraction

## definition

In [19]:
# keywords manually extracted from previous LDA analysis

market_keywords_lst = [
  "price",
  "sell",
  "market",
  "buy",
  "cost",
  "fee"
]


crop_keywords_lst = [
  "soil",
  "fruit",
  "plant",
  "leave",
  "fertilizer",
  "seed",
  "potatoes",
  "passion",
  "maize",
  "tomatoes",
  "banana",
  "manure",
  "layer",
  "coffee",
  "onions",
  "grow",
  "mulch",
  "bean",
  "type",
  "cabbage",
  "land",
  "variety",
  "yield",
  "keep",
  "rice",
  "harvest",
]

seasonality_keywords_lst = [
  "season",
  "time",
]

disease_keywords_lst = [
  "disease",
  "medicine",
  "treat",
  "weed",
  "control",
  "plant",
  "leave",
  "spray",
  "harvest",
  "affect",
  "prevent",
  "pests",
  "chemical",
  "rabbit",
  "care",
  "tick",
  "attack",
  "mean",
  "black",
  "space",
  "turn",
  "keep"
]

livestock_keywords_lst = [
  "animals",
  "pig",
  "cow",
  "goat",
  "hen",
  "chicken",
  "poultry",
  "lay",
  "egg",
  "milk",
  "breed",
  "dairy",
]

## extrapolation

In [20]:
%%time

from gensim.models import Word2Vec
w2v_model = Word2Vec(processed_questions_eng_lst, min_count=1, vector_size=100)

CPU times: user 3min 12s, sys: 751 ms, total: 3min 13s
Wall time: 1min 9s


In [21]:
def run_w2v_broad(keyword_lst, top_n=50):
  full_keyword_lst = []
  for word in keyword_lst:
    full_keyword_lst.extend([w for w, s in w2v_model.wv.most_similar(word, topn=top_n)])
  return full_keyword_lst

In [22]:
def run_w2v_niche(keyword_lst, top_n=50):
  niche_keyword_dct = {}
  for word in keyword_lst:
    niche_keyword_dct[word] =[w for w, s in w2v_model.wv.most_similar(word, topn=top_n)]
  return niche_keyword_dct

In [23]:
%%time

market_keywords_full_lst = run_w2v_broad(market_keywords_lst)
crop_keywords_full_lst = run_w2v_broad(crop_keywords_lst)
seasonality_keywords_full_lst = run_w2v_broad(seasonality_keywords_lst)
disease_keywords_full_lst = run_w2v_broad(disease_keywords_lst)
livestock_keywords_full_lst = run_w2v_broad(livestock_keywords_lst)

CPU times: user 3.12 s, sys: 5.68 ms, total: 3.12 s
Wall time: 154 ms


In [24]:
%%time

market_keywords_niche_dct = run_w2v_niche(market_keywords_lst)
crop_keywords_niche_dct = run_w2v_niche(crop_keywords_lst)
seasonality_keywords_niche_dct = run_w2v_niche(seasonality_keywords_lst)
disease_keywords_niche_dct = run_w2v_niche(disease_keywords_lst)
livestock_keywords_niche_dct = run_w2v_niche(livestock_keywords_lst)

CPU times: user 2.77 s, sys: 0 ns, total: 2.77 s
Wall time: 116 ms


In [25]:
niche_category_dct = {
  "market": market_keywords_niche_dct,
  "crop": crop_keywords_niche_dct,
  "seasonality": seasonality_keywords_niche_dct,
  "disease": disease_keywords_niche_dct,
  "livestock": livestock_keywords_niche_dct
}

In [26]:
broad_category_dct = {
    "market": market_keywords_full_lst,
    "crop": crop_keywords_full_lst,
    "seasonality": seasonality_keywords_full_lst,
    "disease": disease_keywords_full_lst,
    "livestock": livestock_keywords_full_lst
}

# text categorization

## niche keywords

In [27]:
def find_niche_keywords(doc):
  doc_words = set(doc)
  matches_dct = {}
  for category, sub_dct in niche_category_dct.items():
    for sub_category, keywords in sub_dct.items():
      if any(keyword in doc_words for keyword in keywords):
        matches_dct[f"{category}_{sub_category}"] = 1
  return matches_dct

In [28]:
niche_res_lst = []

for doc in tqdm(processed_questions_eng_lst):
  matches_dct = find_niche_keywords(doc)
  niche_res_lst.append(matches_dct)

100%|██████████| 2940542/2940542 [07:01<00:00, 6980.81it/s]


In [29]:
%%time

niche_res_df = pd.DataFrame(niche_res_lst)

CPU times: user 12.5 s, sys: 1e+03 ms, total: 13.5 s
Wall time: 13.5 s


In [30]:
joined_niche_questions_eng_lst = [" ".join(doc) for doc in processed_questions_eng_lst]
niche_questions_eng_df = pd.DataFrame({
  "preprocessed_question": joined_niche_questions_eng_lst
})

In [31]:
niche_questions_id_eng_df = pd.concat([niche_questions_eng_df.reset_index(drop=True), wefarm_eng_df.reset_index(drop=True)], axis=1)
niche_questions_categories_df = pd.concat([niche_questions_id_eng_df, niche_res_df], axis=1)

In [32]:
niche_questions_categories_df.head(5)

Unnamed: 0,preprocessed_question,question_id,question_content,market_price,market_sell,market_buy,seasonality_time,livestock_animals,livestock_pig,livestock_cow,livestock_goat,livestock_hen,livestock_chicken,crop_layer,livestock_poultry,crop_plant,crop_variety,disease_plant,crop_seed,crop_grow,crop_mulch,crop_harvest,disease_harvest,crop_potatoes,crop_maize,crop_banana,crop_coffee,crop_onions,crop_bean,crop_cabbage,crop_land,crop_tomatoes,disease_disease,disease_care,crop_keep,disease_keep,market_market,market_cost,crop_rice,disease_rabbit,disease_treat,disease_prevent,disease_chemical,crop_fruit,livestock_milk,crop_leave,crop_passion,disease_leave,livestock_egg,livestock_breed,crop_fertilizer,crop_manure,seasonality_season,disease_control,crop_yield,livestock_dairy,crop_soil,disease_spray,disease_turn,disease_attack,market_fee,crop_type,disease_medicine,disease_tick,disease_pests,livestock_lay,disease_weed,disease_black,disease_space,disease_affect,disease_mean
0,this go to wefarm be it possible to get for us...,3849061,Q this goes to wefarm. is it possible to get f...,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,have stock rabbit urine for weeks mashambani s...,3849084,Q-i have stock rabbit's urine for 5 weeks mash...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,have mi can start aproject of poutry how can d...,3849098,Q J Have Mi 10000 Can J Start Aproject Of Pout...,,,,,,,,,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,where do get seed of coconut,3849100,WHERE DO I GET SEEDS OF COCONUT?,,1.0,1.0,,,,,,,,,,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,which plant have omega,3849129,Q#.Which plant has omega3?,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [33]:
keyword_collst = niche_questions_categories_df.columns[3:]
keyword_sums = niche_questions_categories_df[keyword_collst].sum().sort_values()
sums_df = keyword_sums.reset_index()
sums_df.columns = ["col", "sum"]

sums_df["prefix"] = [c.split("_")[0] for c in sums_df["col"]]

sums_df = sums_df.sort_values(["prefix", "sum"], ascending=[True, False])

keyword_sums_dct = dict(zip(sums_df["col"], sums_df["sum"]))

In [34]:
import plotly.graph_objects as go

fig = go.Figure()

for k, v in keyword_sums_dct.items():
  fig.add_trace(
    go.Bar(
      x=[k],
      y=[v],
      marker={
        "color": [v],
        "colorscale": "greens"
      },
      name=k,
    )
  )

fig

In [35]:
# from google.colab import drive
# drive.mount('/content/drive')

In [36]:
# niche_questions_categories_df.to_parquet("en_questions_cat_niche.parquet", index=False)
# !cp en_questions_cat_niche.parquet /content/drive/MyDrive/DataKit_WeFarm

## broad keywords

In [37]:
def find_keywords(doc):
  doc_words = set(doc)
  matches_dct = {}
  for category, keywords in broad_category_dct.items():
    if any(keyword in doc_words for keyword in keywords):
      matches_dct[category] = 1
  return matches_dct

In [38]:
broad_res_lst = []

for doc in tqdm(processed_questions_eng_lst):
  matches_dct = find_keywords(doc)
  broad_res_lst.append(matches_dct)

100%|██████████| 2940542/2940542 [03:46<00:00, 12979.67it/s]


In [39]:
broad_res_df = pd.DataFrame(broad_res_lst)

In [40]:
broad_joined_questions_eng_lst = [" ".join(doc) for doc in processed_questions_eng_lst]
broad_questions_eng_df = pd.DataFrame({
  "preprocessed_question": broad_joined_questions_eng_lst
})

In [41]:
broad_questions_id_eng_df = pd.concat([broad_questions_eng_df.reset_index(drop=True), wefarm_eng_df.reset_index(drop=True)], axis=1)
broad_questions_categories_df = pd.concat([broad_questions_id_eng_df, broad_res_df], axis=1)

In [42]:
broad_questions_categories_df.head(5)

Unnamed: 0,preprocessed_question,question_id,question_content,market,seasonality,livestock,crop,disease
0,this go to wefarm be it possible to get for us...,3849061,Q this goes to wefarm. is it possible to get f...,1.0,,,,
1,have stock rabbit urine for weeks mashambani s...,3849084,Q-i have stock rabbit's urine for 5 weeks mash...,1.0,1.0,1.0,,
2,have mi can start aproject of poutry how can d...,3849098,Q J Have Mi 10000 Can J Start Aproject Of Pout...,,,1.0,1.0,
3,where do get seed of coconut,3849100,WHERE DO I GET SEEDS OF COCONUT?,1.0,,,1.0,1.0
4,which plant have omega,3849129,Q#.Which plant has omega3?,,,,1.0,1.0


In [43]:
sum_market = broad_questions_categories_df["market"].sum()
sum_livestock = broad_questions_categories_df["livestock"].sum()
sum_crop = broad_questions_categories_df["crop"].sum()
sum_disease = broad_questions_categories_df["disease"].sum()
sum_seasonality = broad_questions_categories_df["seasonality"].sum()

In [44]:
count_dct = {
  "market": sum_market,
  "livestock": sum_livestock,
  "crop": sum_crop,
  "disease": sum_disease,
  "seasonality": sum_seasonality,
  "nan": broad_questions_categories_df[["market", "disease", "crop", "seasonality", "livestock"]].isnull().all(axis=1).sum()
}

In [45]:
import plotly.graph_objects as go

fig = go.Figure()

for k, v in count_dct.items():
  fig.add_trace(
    go.Bar(
      x=[k],
      y=[v],
      marker={
        "color": [v],
        "colorscale": "greens"
      },
      name=k,
    )
  )

In [50]:
fig.update_layout({
  "title": "Reduced nulls from 1.6 million to 0.25 million"
})

In [47]:
from plotly.subplots import make_subplots

bigfig = make_subplots(
  1, 6,
  specs=[[{'type': 'pie'}] * 6]
  )

for idx, (k, v) in enumerate(count_dct.items()):
  bigfig.add_trace(
    go.Pie(
      labels=[k, "not"],
      values=[v, len(broad_questions_categories_df) - v],
      marker={"colors": ["#D742A8", "#E7E7E7"]}
    ),
    row=1, col=idx+1
  )

bigfig.update_layout({
  "title": 'The majority of all questions concern "crops" and/or "disease"'
})

In [48]:
# broad_questions_categories_df.to_parquet("en_questions_cat_broad.parquet", index=False)
# !cp en_questions_cat_broad.parquet /content/drive/MyDrive/DataKit_WeFarm

In [49]:
# broad_questions_categories_df[["market", "disease", "crop", "seasonality", "livestock"]].isnull().all(axis=1).sum()