In [1]:
# 3. Imports
import json
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
df = pd.read_csv('Kenya_Agri_full_dataset.csv')

In [3]:
df.head()

Unnamed: 0,id,query_text,topic_label,language,region,timestamp,target_response
0,1,Can you give some tips on maintaining cleanlin...,livestock,swahili,,2024-11-25 08:37:28,Hi <NAME>! 🌼 I can help you with that! Here ar...
1,2,What are the important steps to take when the ...,weather,english,,2024-11-25 08:47:28,Seems like you're looking for information on h...
2,3,Can you give some tips on keeping the herd cle...,other,english,,2024-11-25 08:57:28,To maintain a clean flock and prevent diseases...
3,4,Why is toxic binder used in Livestock feeds,livestock,english,,2024-11-25 09:07:28,"Toxic binders, also known as mycotoxin binders..."
4,5,Is maize silage the best feed for dairy cows,livestock,english,,2024-11-25 09:17:28,"To maximize milk production in dairy cows, the..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23620 entries, 0 to 23619
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               23620 non-null  int64  
 1   query_text       23620 non-null  object 
 2   topic_label      23620 non-null  object 
 3   language         23620 non-null  object 
 4   region           0 non-null      float64
 5   timestamp        23620 non-null  object 
 6   target_response  23620 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.3+ MB


In [5]:
df = df[['query_text','target_response']]

In [6]:
df.head()

Unnamed: 0,query_text,target_response
0,Can you give some tips on maintaining cleanlin...,Hi <NAME>! 🌼 I can help you with that! Here ar...
1,What are the important steps to take when the ...,Seems like you're looking for information on h...
2,Can you give some tips on keeping the herd cle...,To maintain a clean flock and prevent diseases...
3,Why is toxic binder used in Livestock feeds,"Toxic binders, also known as mycotoxin binders..."
4,Is maize silage the best feed for dairy cows,"To maximize milk production in dairy cows, the..."


In [7]:
df.tail()

Unnamed: 0,query_text,target_response
23615,What are some cultural practices that can help...,Hi <NAME>! 👋\n\nTo prevent pest infestations o...
23616,"1, what disease that makes part of the mango t...",Seems like that particular topic wasn't in my ...
23617,How can integrated pest management help in eff...,Integrated Pest Management (IPM) is a great st...
23618,Can you explain the role of mulching in pest c...,Hi <NAME>! 🌿\n\nMulching plays a crucial role ...
23619,How important is soil drainage for banana cult...,Hi <NAME>! Soil drainage is crucial for banana...


In [8]:
df.shape

(23620, 2)

In [9]:
# 2. Inspect
print("Initial shape:", df.shape)
print("Nulls per column:\n", df.isna().sum())

Initial shape: (23620, 2)
Nulls per column:
 query_text         0
target_response    0
dtype: int64


In [10]:
# 3. Drop nulls
df = df.dropna(subset=['query_text', 'target_response']).reset_index(drop=True)
print("After dropping nulls:", df.shape)

After dropping nulls: (23620, 2)


In [11]:
df.duplicated().sum()

np.int64(206)

In [12]:
# 4. Drop exact duplicates
before = df.shape[0]
df = df.drop_duplicates(subset=['query_text', 'target_response']).reset_index(drop=True)
print(f"Dropped {before - df.shape[0]} duplicate rows; new shape:", df.shape)

Dropped 206 duplicate rows; new shape: (23414, 2)


In [13]:
# 5. Text‐normalization function
def clean_text(text):
    # strip HTML
    text = BeautifulSoup(text, "lxml").get_text(separator=" ")
    # lowercase
    text = text.lower()
    # remove non‑alphabetic (but keep numbers, % and common ag tokens)
    text = re.sub(r"[^a-z0-9%°µph\s]", " ", text)
    # collapse whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [14]:
# 6. Apply cleaning
df['query_clean']    = df['query_text'].apply(clean_text)
df['response_clean'] = df['target_response'].apply(clean_text)

# 7. Peek
print(df[['query_clean','response_clean']].head(5))

                                         query_clean  \
0  can you give some tips on maintaining cleanlin...   
1  what are the important steps to take when the ...   
2  can you give some tips on keeping the herd cle...   
3        why is toxic binder used in livestock feeds   
4       is maize silage the best feed for dairy cows   

                                      response_clean  
0  hi i can help you with that here are some tips...  
1  seems like you re looking for information on h...  
2  to maintain a clean flock and prevent diseases...  
3  toxic binders also known as mycotoxin binders ...  
4  to maximize milk production in dairy cows the ...  


In [15]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
# --- 1) Split into train/val (90/10) ---
texts     = df['query_clean'].tolist()
responses = df['response_clean'].tolist()

train_texts, val_texts, train_responses, val_responses = train_test_split(
    texts, responses, test_size=0.1, random_state=42)

print("Train pairs:", len(train_texts))
print("Val pairs:  ", len(val_texts))

Train pairs: 21072
Val pairs:   2342


In [17]:
# --- 2) Build tokenizer on train set ---
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts + train_responses)

# Optional: inspect vocab size
print("Actual vocab size:", min(MAX_VOCAB_SIZE, len(tokenizer.word_index)+1))


Actual vocab size: 14693


In [18]:
# --- 3) Convert to integer sequences ---
train_q_seq = tokenizer.texts_to_sequences(train_texts)
val_q_seq   = tokenizer.texts_to_sequences(val_texts)

train_r_seq = tokenizer.texts_to_sequences(train_responses)
val_r_seq   = tokenizer.texts_to_sequences(val_responses)

In [19]:
# --- 4) Choose max lengths (you can adjust these) ---
# Here we use 95th‑percentile lengths:
all_q_lens = [len(s) for s in train_q_seq]
all_r_lens = [len(s) for s in train_r_seq]

max_q_len = int(np.percentile(all_q_lens, 95))
max_r_len = int(np.percentile(all_r_lens, 95))
print(f"95th pct query len = {max_q_len}, response len = {max_r_len}")


95th pct query len = 18, response len = 226


In [20]:
# --- 5) Pad/truncate sequences ---
train_q = pad_sequences(train_q_seq, maxlen=max_q_len, padding='post', truncating='post')
val_q   = pad_sequences(val_q_seq,   maxlen=max_q_len, padding='post', truncating='post')

train_r = pad_sequences(train_r_seq, maxlen=max_r_len, padding='post', truncating='post')
val_r   = pad_sequences(val_r_seq,   maxlen=max_r_len, padding='post', truncating='post')

print("train_q shape:", train_q.shape)
print("train_r shape:", train_r.shape)
print("val_q   shape:", val_q.shape)
print("val_r   shape:", val_r.shape)

train_q shape: (21072, 18)
train_r shape: (21072, 226)
val_q   shape: (2342, 18)
val_r   shape: (2342, 226)


In [21]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [22]:
df.head()

Unnamed: 0,query_text,target_response,query_clean,response_clean
0,Can you give some tips on maintaining cleanlin...,Hi <NAME>! 🌼 I can help you with that! Here ar...,can you give some tips on maintaining cleanlin...,hi i can help you with that here are some tips...
1,What are the important steps to take when the ...,Seems like you're looking for information on h...,what are the important steps to take when the ...,seems like you re looking for information on h...
2,Can you give some tips on keeping the herd cle...,To maintain a clean flock and prevent diseases...,can you give some tips on keeping the herd cle...,to maintain a clean flock and prevent diseases...
3,Why is toxic binder used in Livestock feeds,"Toxic binders, also known as mycotoxin binders...",why is toxic binder used in livestock feeds,toxic binders also known as mycotoxin binders ...
4,Is maize silage the best feed for dairy cows,"To maximize milk production in dairy cows, the...",is maize silage the best feed for dairy cows,to maximize milk production in dairy cows the ...


In [23]:
# 1) Add start/end markers
df['response_clean'] = df['response_clean'].apply(
    lambda txt: '<start> ' + txt + ' <end>'
)

In [24]:
df.head()

Unnamed: 0,query_text,target_response,query_clean,response_clean
0,Can you give some tips on maintaining cleanlin...,Hi <NAME>! 🌼 I can help you with that! Here ar...,can you give some tips on maintaining cleanlin...,<start> hi i can help you with that here are s...
1,What are the important steps to take when the ...,Seems like you're looking for information on h...,what are the important steps to take when the ...,<start> seems like you re looking for informat...
2,Can you give some tips on keeping the herd cle...,To maintain a clean flock and prevent diseases...,can you give some tips on keeping the herd cle...,<start> to maintain a clean flock and prevent ...
3,Why is toxic binder used in Livestock feeds,"Toxic binders, also known as mycotoxin binders...",why is toxic binder used in livestock feeds,<start> toxic binders also known as mycotoxin ...
4,Is maize silage the best feed for dairy cows,"To maximize milk production in dairy cows, the...",is maize silage the best feed for dairy cows,<start> to maximize milk production in dairy c...


In [25]:
# 2) Split again (we want markers in train/val)
texts     = df['query_clean'].tolist()
responses = df['response_clean'].tolist()
train_texts, val_texts, train_resps, val_resps = train_test_split(
    texts, responses, test_size=0.1, random_state=42)

In [26]:
# 3) Re‑build tokenizer on train only
MAX_VOCAB_SIZE = 20000
tok = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<OOV>')
tok.fit_on_texts(train_texts + train_resps)
print("Vocab size:", min(MAX_VOCAB_SIZE, len(tok.word_index)+1))

Vocab size: 14693


In [27]:
# 4) Text → int sequences
train_q_seq = tok.texts_to_sequences(train_texts)
val_q_seq   = tok.texts_to_sequences(val_texts)
train_r_seq = tok.texts_to_sequences(train_resps)
val_r_seq   = tok.texts_to_sequences(val_resps)

In [28]:
# 5) Determine 95th‑percentile lengths (again, after adding tokens)
import pandas as pd
all_q_lens = [len(s) for s in train_q_seq]
all_r_lens = [len(s) for s in train_r_seq]
max_q_len = int(pd.Series(all_q_lens).quantile(0.95))
max_r_len = int(pd.Series(all_r_lens).quantile(0.95))
print(f"Max Q len = {max_q_len}, Max R len = {max_r_len}")

Max Q len = 18, Max R len = 228


In [29]:
# 6) Pad queries
train_q = pad_sequences(train_q_seq, maxlen=max_q_len, padding='post')
val_q   = pad_sequences(val_q_seq,   maxlen=max_q_len, padding='post')

In [30]:
# 7) Prepare decoder input & target sequences
#    decoder_input is the full sequence except the last token;
#    decoder_target is the full sequence except the first token.
def make_decoder_data(seqs, maxlen):
    inp = []
    tgt = []
    for s in seqs:
        # pad/truncate first so that shifts align
        s_pad = pad_sequences([s], maxlen=maxlen, padding='post')[0]
        inp.append(s_pad[:-1])
        tgt.append(s_pad[1:])
    return np.array(inp), np.array(tgt)

train_decoder_in,  train_decoder_tgt  = make_decoder_data(train_r_seq, max_r_len)
val_decoder_in,    val_decoder_tgt    = make_decoder_data(val_r_seq,   max_r_len)


In [31]:
# 8) Inspect shapes
print("train_q         ", train_q.shape)
print("train_decoder_in", train_decoder_in.shape)
print("train_decoder_tgt",train_decoder_tgt.shape)
print("val_q           ", val_q.shape)
print("val_decoder_in  ", val_decoder_in.shape)
print("val_decoder_tgt ",val_decoder_tgt.shape)

train_q          (21072, 18)
train_decoder_in (21072, 227)
train_decoder_tgt (21072, 227)
val_q            (2342, 18)
val_decoder_in   (2342, 227)
val_decoder_tgt  (2342, 227)


In [32]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, AdditiveAttention, Concatenate
from tensorflow.keras.models import Model

In [33]:
# — Parameters —
vocab_size     = min(MAX_VOCAB_SIZE, len(tok.word_index) + 1)  # e.g. 14693 + 1
embedding_dim  = 256
lstm_units     = 512
encoder_maxlen = max_q_len      # 18 from before
decoder_maxlen = max_r_len - 1  # decoder_in length (228 - 1 = 227)

# — Encoder —
encoder_inputs = Input(shape=(encoder_maxlen,), name="encoder_inputs")
enc_emb        = Embedding(vocab_size, embedding_dim,
                           mask_zero=True,
                           name="encoder_embedding")(encoder_inputs)
encoder_lstm   = LSTM(lstm_units,
                      return_sequences=True,
                      return_state=True,
                      name="encoder_lstm")
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

In [34]:
# — Decoder —
decoder_inputs = Input(shape=(decoder_maxlen,), name="decoder_inputs")
dec_emb_layer  = Embedding(vocab_size, embedding_dim,
                           mask_zero=True,
                           name="decoder_embedding")
dec_emb        = dec_emb_layer(decoder_inputs)
decoder_lstm   = LSTM(lstm_units,
                      return_sequences=True,
                      return_state=True,
                      name="decoder_lstm")
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=[state_h, state_c])


In [35]:
# — Attention —
attn_layer   = AdditiveAttention(name="attention_layer")
context_vec  = attn_layer([decoder_outputs, encoder_outputs])
# combine context + decoder LSTM outputs
decoder_concat = Concatenate(axis=-1, name="concat_layer")([context_vec, decoder_outputs])


In [36]:
# — Final projection —
decoder_dense = Dense(vocab_size, activation="softmax", name="output_layer")
decoder_preds = decoder_dense(decoder_concat)


In [37]:
# — Define & compile model —
model = Model([encoder_inputs, decoder_inputs], decoder_preds)
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

In [38]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [39]:
# — Callbacks —
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=2,
        restore_best_weights=True,
        verbose=1
    ),
    ModelCheckpoint(
        filepath='best_agri_seq2seq.h5',
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )
]


In [40]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, LSTM,
    Attention, Concatenate, Dense
)
from tensorflow.keras.models import Model

# — Hyperparameters —
vocab_size     = min(MAX_VOCAB_SIZE, len(tok.word_index) + 1)
embedding_dim  = 256
lstm_units     = 512
encoder_maxlen = max_q_len
decoder_maxlen = max_r_len - 1

# — Inputs —
encoder_inputs = Input(shape=(encoder_maxlen,), name="encoder_inputs")
decoder_inputs = Input(shape=(decoder_maxlen,), name="decoder_inputs")

# — Embeddings (no mask_zero) —
enc_emb = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    name="encoder_embedding"
)(encoder_inputs)

dec_emb = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    name="decoder_embedding"
)(decoder_inputs)

# — Encoder LSTM —
encoder_lstm = LSTM(
    lstm_units,
    return_sequences=True,
    return_state=True,
    name="encoder_lstm"
)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# — Decoder LSTM —
decoder_lstm = LSTM(
    lstm_units,
    return_sequences=True,
    return_state=True,
    name="decoder_lstm"
)
decoder_outputs, _, _ = decoder_lstm(
    dec_emb,
    initial_state=[state_h, state_c]
)

# — Dot‑product Attention —
attn_out = Attention(name="attention_layer")(
    [decoder_outputs, encoder_outputs]
)

# — Concat & Final Dense —
decoder_concat = Concatenate(name="concat_layer")(
    [attn_out, decoder_outputs]
)
decoder_preds = Dense(
    vocab_size,
    activation="softmax",
    name="output_layer"
)(decoder_concat)

# — Build & Compile —
model = Model([encoder_inputs, decoder_inputs], decoder_preds)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

In [41]:
history = model.fit(
    [train_q, train_decoder_in],
    train_decoder_tgt[..., np.newaxis],
    validation_data=(
        [val_q, val_decoder_in],
        val_decoder_tgt[..., np.newaxis]
    ),
    batch_size=64,
    epochs=10,
    callbacks=callbacks
)

Epoch 1/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 716ms/step - accuracy: 0.4545 - loss: 4.0713
Epoch 1: val_loss improved from inf to 2.57685, saving model to best_agri_seq2seq.h5




[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 763ms/step - accuracy: 0.4547 - loss: 4.0690 - val_accuracy: 0.5938 - val_loss: 2.5769
Epoch 2/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 733ms/step - accuracy: 0.6016 - loss: 2.4629
Epoch 2: val_loss improved from 2.57685 to 2.00556, saving model to best_agri_seq2seq.h5




[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 777ms/step - accuracy: 0.6016 - loss: 2.4625 - val_accuracy: 0.6525 - val_loss: 2.0056
Epoch 3/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 734ms/step - accuracy: 0.6593 - loss: 1.9111
Epoch 3: val_loss improved from 2.00556 to 1.66111, saving model to best_agri_seq2seq.h5




[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 784ms/step - accuracy: 0.6594 - loss: 1.9108 - val_accuracy: 0.6936 - val_loss: 1.6611
Epoch 4/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 735ms/step - accuracy: 0.6988 - loss: 1.5862
Epoch 4: val_loss improved from 1.66111 to 1.47490, saving model to best_agri_seq2seq.h5




[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m258s[0m 783ms/step - accuracy: 0.6989 - loss: 1.5860 - val_accuracy: 0.7182 - val_loss: 1.4749
Epoch 5/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 735ms/step - accuracy: 0.7259 - loss: 1.3774
Epoch 5: val_loss improved from 1.47490 to 1.35772, saving model to best_agri_seq2seq.h5




[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 783ms/step - accuracy: 0.7259 - loss: 1.3774 - val_accuracy: 0.7351 - val_loss: 1.3577
Epoch 6/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 734ms/step - accuracy: 0.7420 - loss: 1.2570
Epoch 6: val_loss improved from 1.35772 to 1.27766, saving model to best_agri_seq2seq.h5




[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 779ms/step - accuracy: 0.7420 - loss: 1.2569 - val_accuracy: 0.7463 - val_loss: 1.2777
Epoch 7/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 734ms/step - accuracy: 0.7578 - loss: 1.1473
Epoch 7: val_loss improved from 1.27766 to 1.22374, saving model to best_agri_seq2seq.h5




[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 786ms/step - accuracy: 0.7578 - loss: 1.1473 - val_accuracy: 0.7545 - val_loss: 1.2237
Epoch 8/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 734ms/step - accuracy: 0.7690 - loss: 1.0688
Epoch 8: val_loss improved from 1.22374 to 1.17924, saving model to best_agri_seq2seq.h5




[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 776ms/step - accuracy: 0.7690 - loss: 1.0688 - val_accuracy: 0.7620 - val_loss: 1.1792
Epoch 9/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 734ms/step - accuracy: 0.7813 - loss: 0.9935
Epoch 9: val_loss improved from 1.17924 to 1.14934, saving model to best_agri_seq2seq.h5




[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 784ms/step - accuracy: 0.7813 - loss: 0.9936 - val_accuracy: 0.7672 - val_loss: 1.1493
Epoch 10/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 734ms/step - accuracy: 0.7899 - loss: 0.9397
Epoch 10: val_loss improved from 1.14934 to 1.12228, saving model to best_agri_seq2seq.h5




[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 786ms/step - accuracy: 0.7899 - loss: 0.9397 - val_accuracy: 0.7720 - val_loss: 1.1223
Restoring model weights from the end of the best epoch: 10.


In [42]:
# saves both architecture + weights + optimizer state
model.save('best_agri_seq2seq.keras')

### Reconstruct encoder & decoder for inference

In [43]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, LSTM, Embedding, Attention, Concatenate, Dense
from tensorflow.keras import Model

In [44]:
# — Reload full model (for weights) —
full_model = load_model('best_agri_seq2seq.keras')

  saveable.load_own_variables(weights_store.get(inner_path))


In [45]:
# 2) Extract dims from the full model
vocab_size     = full_model.output_shape[-1]               # (None, 227, vocab_size)
encoder_maxlen = full_model.input[0].shape[1]              # 18
decoder_maxlen = full_model.input[1].shape[1]              # 227
embedding_dim  = full_model.get_layer('encoder_embedding').output.shape[-1]
lstm_units     = full_model.get_layer('encoder_lstm').units

print(f"vocab_size={vocab_size}, emb_dim={embedding_dim}, lstm_units={lstm_units}")
print(f"encoder_maxlen={encoder_maxlen}, decoder_maxlen={decoder_maxlen}")

vocab_size=14693, emb_dim=256, lstm_units=512
encoder_maxlen=18, decoder_maxlen=227


In [46]:
# 3) Rebuild the inference encoder
enc_inputs = Input(shape=(encoder_maxlen,), name='enc_inputs')
enc_emb    = full_model.get_layer('encoder_embedding')(enc_inputs)
enc_outs, st_h, st_c = full_model.get_layer('encoder_lstm')(enc_emb)
encoder_model = Model(enc_inputs, [enc_outs, st_h, st_c])

In [47]:
# 4) Rebuild the inference decoder
from tensorflow.keras.layers import Attention, Concatenate

dec_inputs   = Input(shape=(decoder_maxlen,), name='dec_inputs')
dec_h_in     = Input(shape=(lstm_units,), name='dec_h_in')
dec_c_in     = Input(shape=(lstm_units,), name='dec_c_in')
enc_outs_inf = Input(shape=(encoder_maxlen, lstm_units), name='enc_outs_inf')

dec_emb      = full_model.get_layer('decoder_embedding')(dec_inputs)
dec_lstm     = full_model.get_layer('decoder_lstm')
dec_outs, dh, dc = dec_lstm(dec_emb, initial_state=[dec_h_in, dec_c_in])

attn_out     = full_model.get_layer('attention_layer')([dec_outs, enc_outs_inf])
dec_concat   = full_model.get_layer('concat_layer')([attn_out, dec_outs])
dec_preds    = full_model.get_layer('output_layer')(dec_concat)

decoder_model = Model(
    [dec_inputs, dec_h_in, dec_c_in, enc_outs_inf],
    [dec_preds, dh, dc]
)

print("Inference models built successfully.")

Inference models built successfully.


### implement greedy decoding and test a few examples.

In [48]:
import numpy as np

# Figure out what the tokenizer actually stored
start_key = '<start>' if '<start>' in tok.word_index else 'start'
end_key   = '<end>'   if '<end>'   in tok.word_index else 'end'
start_token = tok.word_index[start_key]
end_token   = tok.word_index[end_key]
print(f"Using start_token={start_token} ({start_key}), end_token={end_token} ({end_key})")

def decode_sequence(input_seq):
    # 1) Encode
    enc_outs, h, c = encoder_model.predict(input_seq, verbose=0)

    # 2) Seed with the start token
    target_seq = np.array([[start_token]])
    decoded = []

    for _ in range(decoder_maxlen):
        preds, h, c = decoder_model.predict([target_seq, h, c, enc_outs], verbose=0)
        sampled = np.argmax(preds[0, -1, :])
        if sampled == end_token:
            break
        decoded.append(tok.index_word.get(sampled, '<UNK>'))
        target_seq = np.array([[sampled]])

    return ' '.join(decoded)


Using start_token=17 (start), end_token=18 (end)


In [49]:
# Rerun a few samples
for idx in [5, 25, 100]:
    inp = val_q[idx:idx+1]
    print("Q   :", val_texts[idx])
    print("Pred:", decode_sequence(inp))
    print("GT  :", val_resps[idx], "\n")

Q   : maize
Pred: q1 what are some common pests and diseases that affect coffee plants a1 antestia bugs can be used to control pests in mango farming q2 what are the critical factors for successful promotion of banana farming a2 the use of resistant varieties for mango farming include single super phosphate ssp di ammonium phosphate and triple super phosphate tsp and phosphoric phosphorous acid the choice of a phosphatic fertilizer depends on the soil reaction ph q3 how can farmers ensure the quality of coffee seedlings a3 the right amount of inputs depends on the soil reaction ph q3 what are the advantages of using green manure in coffee farming a3 the recommended spacing for mango farming is 65 kg ha of phosphorus and 100 130 kg ha during the growing season and increase the number of coffee plants q3 how can farmers ensure the quality of coffee seedlings a3 receive adequate water for the coffee plants to ensure optimal growth and yield
GT  : <start> late blight in potato plants can h

### Compute BLEU on the full validation split

In [50]:
import random
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# pick 200 random indices
sample_idx = random.sample(range(len(val_q)), 200)

# build reference and candidate lists only for that subset
refs = [[val_resps[i].split()[1:-1]] for i in sample_idx]
cands = [decode_sequence(val_q[i:i+1]).split() for i in sample_idx]

smooth = SmoothingFunction().method1
bleu1 = corpus_bleu(refs, cands, weights=(1,0,0,0), smoothing_function=smooth)
bleu2 = corpus_bleu(refs, cands, weights=(0.5,0.5,0,0), smoothing_function=smooth)
bleu4 = corpus_bleu(refs, cands, weights=(0.25,0.25,0.25,0.25), smoothing_function=smooth)

print(f"Sample BLEU‑1: {bleu1:.4f}")
print(f"Sample BLEU‑2: {bleu2:.4f}")
print(f"Sample BLEU‑4: {bleu4:.4f}")



Sample BLEU‑1: 0.3948
Sample BLEU‑2: 0.2809
Sample BLEU‑4: 0.1852


### Beam‐search decoding
Greedy decoding can miss better overall sentences. Here’s a simple beam‐search implementatioN

In [51]:
import heapq
import numpy as np

def decode_beam(input_seq, beam_width=3):
    enc_outs, h, c = encoder_model.predict(input_seq, verbose=0)
    start_tok = start_token
    end_tok   = end_token

    # (negative log‑prob so smaller is better), token list, h, c
    sequences = [(0.0, [start_tok], h, c)]

    for _ in range(decoder_maxlen):
        all_cands = []
        for score, seq, h_prev, c_prev in sequences:
            if seq[-1] == end_tok:
                # already ended
                all_cands.append((score, seq, h_prev, c_prev))
                continue

            tgt = np.array([[seq[-1]]])
            preds, h_new, c_new = decoder_model.predict([tgt, h_prev, c_prev, enc_outs], verbose=0)
            probs = preds[0, -1, :]
            # pick top k
            for idx in np.argsort(probs)[-beam_width:]:
                new_score = score - np.log(probs[idx] + 1e-9)
                all_cands.append((new_score, seq + [idx], h_new, c_new))

        # keep top beam_width
        sequences = heapq.nsmallest(beam_width, all_cands, key=lambda x: x[0])

    # choose best
    best_seq = sequences[0][1]
    # convert IDs → words, strip start/end
    words = [tok.index_word.get(i, '<UNK>') for i in best_seq if i not in (start_tok, end_tok)]
    return ' '.join(words)

In [52]:
# Test it side‑by‑side:
for idx in [5, 25, 100]:
    inp = val_q[idx:idx+1]
    print("Q   :", val_texts[idx])
    print("Greedy:", decode_sequence(inp))
    print("Beam  :", decode_beam(inp, beam_width=5))
    print()

Q   : maize
Greedy: q1 what are some common pests and diseases that affect coffee plants a1 antestia bugs can be used to control pests in mango farming q2 what are the critical factors for successful promotion of banana farming a2 the use of resistant varieties for mango farming include single super phosphate ssp di ammonium phosphate and triple super phosphate tsp and phosphoric phosphorous acid the choice of a phosphatic fertilizer depends on the soil reaction ph q3 how can farmers ensure the quality of coffee seedlings a3 the right amount of inputs depends on the soil reaction ph q3 what are the advantages of using green manure in coffee farming a3 the recommended spacing for mango farming is 65 kg ha of phosphorus and 100 130 kg ha during the growing season and increase the number of coffee plants q3 how can farmers ensure the quality of coffee seedlings a3 receive adequate water for the coffee plants to ensure optimal growth and yield
Beam  : seems like that particular topic wasn 

### Convert your DataFrame into HF Dataset format

In [53]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [54]:
from datasets import Dataset

# Convert your cleaned DataFrame to an HF Dataset
ds = Dataset.from_pandas(
    df[['query_clean','response_clean']]
      .rename(columns={'query_clean':'input_text','response_clean':'target_text'})
)

# Split 90/10
ds = ds.train_test_split(test_size=0.1, seed=42)


In [55]:
from transformers import T5TokenizerFast
tokenizer = T5TokenizerFast.from_pretrained('t5-small')
def preprocess(batch):
    inp = tokenizer(batch['input_text'], truncation=True, padding='max_length', max_length=64)
    tgt = tokenizer(batch['target_text'], truncation=True, padding='max_length', max_length=256)
    inp['labels'] = tgt['input_ids']
    return inp
tokenized = ds.map(preprocess, batched=True, remove_columns=ds['train'].column_names)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/21072 [00:00<?, ? examples/s]

Map:   0%|          | 0/2342 [00:00<?, ? examples/s]

In [56]:
!pip install --upgrade transformers datasets



In [57]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained('t5-small')
args = TrainingArguments(
    output_dir='t5_agri',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=100,
    # legacy evaluation & checkpointing
    eval_steps=500,               # run evaluation every 500 steps
    save_steps=500,               # save a checkpoint every 500 steps
    save_total_limit=2,
    # omit predict_with_generate
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    tokenizer=tokenizer,
    # you can still add a compute_metrics function here if you want BLEU/ROUGE
)
trainer.train()


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msammykush2020[0m ([33msammykush2020-cysparks[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,4.2335
200,2.5937
300,2.383
400,2.2014
500,2.1379
600,2.1065
700,2.0857
800,2.0739
900,2.0236
1000,2.0118


TrainOutput(global_step=3951, training_loss=1.9919171209004642, metrics={'train_runtime': 1965.1377, 'train_samples_per_second': 32.169, 'train_steps_per_second': 2.011, 'total_flos': 1069470915231744.0, 'train_loss': 1.9919171209004642, 'epoch': 3.0})

In [61]:
import torch

# Ensure model and tokenizer are already loaded and on the correct device
device = model.device

# 1) Beam-search generation
def generate_beam(text, num_beams=5):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outs = model.generate(
        **inputs,
        max_length=128,
        num_beams=num_beams,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )
    return tokenizer.decode(outs[0], skip_special_tokens=True)

# 2) Top-k/top-p sampling generation
def generate_sample(text, top_k=50, top_p=0.95, temp=0.9):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outs = model.generate(
        **inputs,
        max_length=128,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temp,
        repetition_penalty=2.0
    )
    return tokenizer.decode(outs[0], skip_special_tokens=True)

# 3) Interactive chat function
def chat_once():
    query = input("\nEnter your agri question:\n> ")
    beam_ans   = generate_beam(query, num_beams=5)
    sample_ans = generate_sample(query, top_k=50, top_p=0.95, temp=0.9)

    print("\n--- Responses:")
    print("Beam-search:\n", beam_ans, "\n")
    print("Sampling:\n", sample_ans, "\n")

# 4) Run the interactive prompt
chat_once()



Enter your agri question:
> how important is soil drainage for banana cultivation

--- Responses:
Beam-search:
 start> seems like that particular topic wasn t in my last update could you reframe that for me end> 

Sampling:
 start> it seems like there are many important reasons soil drainage is vital for banana trees that help in planting the mango trees to produce well organized soiling helps manage soil growth and reduces the spread of diseases as you can make sure all natural minerals from their vines should be separated with each other 3 why so many different nutrients add vitamin D supplements such as calcium magnesium and potassium potassium provide optimal drainage at a wide range 1 how can farmers understand what amene levels when harvesting banana tree seedlings do not have a drainage system around the bottom layer structure between banana cultivation 2 how can your cultivation balance food irrigation 



In [62]:
# 1) A simple cleaner to strip your marker tokens and tidy whitespace
def clean_text(text):
    return text.replace("start>", "").replace("end>", "").strip()

# 2) Updated beam-search with no_repeat_ngram_size
def generate_beam_clean(text, num_beams=5):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outs = model.generate(
        **inputs,
        max_length=128,
        num_beams=num_beams,
        repetition_penalty=2.5,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
        early_stopping=True
    )
    raw = tokenizer.decode(outs[0], skip_special_tokens=False)
    return clean_text(raw)

# 3) Updated sampling with stronger penalties
def generate_sample_clean(text, top_k=50, top_p=0.9, temp=0.8):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outs = model.generate(
        **inputs,
        max_length=128,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temp,
        repetition_penalty=2.2,
        no_repeat_ngram_size=3
    )
    raw = tokenizer.decode(outs[0], skip_special_tokens=False)
    return clean_text(raw)

# 4) Try them out on your example
q = "how important is soil drainage for banana cultivation"
print("Beam  >", generate_beam_clean(q, num_beams=5))
print("Sample>", generate_sample_clean(q))

Beam  > <pad> <unk> seems like that particular topic wasn t in my last update could you reframe that for me <unk></s>
Sample> <pad> <unk> to reduce the use of soils in banana cultivation we should consider the importance of an organic environment for irrigation and drainage at the same time before planting plant trees you need regular nitrogen per 100 ml of oil into water and add new soils composting helps in managing disease spread by reducing the risk that is associated with farming practices using apical methods help in ensuring the soil can be used during drought cycles where some nutrients are needed must be provided have adequate nutrition 3 how often should i maintain soil fertility while maintaining good overall drinking conditions allow optimal growth after being planted as well as other fertilizers


In [63]:
def clean_text(text):
    # decode with skip_special_tokens and then tidy whitespace
    return (
        text
        .replace("<pad>", "")
        .replace("<unk>", "")
        .replace("</s>", "")
        .strip()
    )

def generate_beam_clean(text, num_beams=5):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outs = model.generate(
        **inputs,
        max_length=128,
        num_beams=num_beams,
        repetition_penalty=2.5,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
        early_stopping=True
    )
    raw = tokenizer.decode(outs[0], skip_special_tokens=True)
    return clean_text(raw)

def generate_sample_clean(text, top_k=50, top_p=0.9, temp=0.8):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outs = model.generate(
        **inputs,
        max_length=128,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temp,
        repetition_penalty=2.2,
        no_repeat_ngram_size=3
    )
    raw = tokenizer.decode(outs[0], skip_special_tokens=True)
    return clean_text(raw)

# Test again
q = "how important is soil drainage for banana cultivation"
print("Beam  >", generate_beam_clean(q))
print("Sample>", generate_sample_clean(q))

Beam  > start> seems like that particular topic wasn t in my last update could you please reframe that for me end>
Sample> start> hey for banana cultivation you can consider the importance of soil drainage when using water in the soil this reduces soil fertility and promote higher soil quality farmers are making use of compost fertilizers to enhance soil yield by planting fruit or vegetable crops here are some important points that contribute to enhancing soil balance irrigation systems such as irrigation systems and irrigation services 1 how does soil formation assist with banana cultivation 2 what is the key role at each plant 7 how often should grape trees be used during the drought please press any number to ask the question end>


In [64]:
def clean_text(text):
    for tok in ["start>", "<start>", "end>", "<end>", "<pad>", "<unk>", "</s>", "<s>"]:
        text = text.replace(tok, "")
    # collapse multiple spaces and strip
    return " ".join(text.split())

In [65]:
# beam‐search clean
def generate_beam_clean(text, num_beams=5):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outs = model.generate(
        **inputs,
        max_length=128,
        num_beams=num_beams,
        repetition_penalty=2.5,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
        early_stopping=True
    )
    raw = tokenizer.decode(outs[0], skip_special_tokens=True)
    return clean_text(raw)

# sampling clean
def generate_sample_clean(text, top_k=50, top_p=0.9, temp=0.8):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outs = model.generate(
        **inputs,
        max_length=128,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temp,
        repetition_penalty=2.2,
        no_repeat_ngram_size=3
    )
    raw = tokenizer.decode(outs[0], skip_special_tokens=True)
    return clean_text(raw)

# Test again
q = "how important is soil drainage for banana cultivation"
print("Beam  >", generate_beam_clean(q))
print("Sample>", generate_sample_clean(q))


Beam  > soil drainage plays a crucial role in banana cultivation here s how it helps soil drainage for banana cultivation depends on the information provided in the context soil drainage is a key component of soil drainage which can help in maintaining soil health and fertility improves soil quality ensures proper application of soil water to reduce the risk of pest diseases and disease spreads by reducing the need for fertilizer use of antibiotics used in banana farming 1 how often does soil drainage play a important role in potato cultivation 2 what are the benefits of using nutrient enriching bananas please press any number to ask the question
Sample> soil drainage plays a crucial role in banana cultivation here are some key points to consider when considering the importance of water and humidity for banana plants fertilizer can be applied over time by applying irrigation systems like weeds and irritants shrubbery composting can also be provided at different levels below the recomme

In [66]:
model.save_pretrained("t5_agri_final")
tokenizer.save_pretrained("t5_agri_final")

('t5_agri_final/tokenizer_config.json',
 't5_agri_final/special_tokens_map.json',
 't5_agri_final/spiece.model',
 't5_agri_final/added_tokens.json',
 't5_agri_final/tokenizer.json')

In [67]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [68]:
import gradio as gr

def respond(query):
    return generate_beam_clean(query)  # or generate_sample_clean

iface = gr.Interface(
    fn=respond,
    inputs=gr.Textbox(lines=2, placeholder="Ask about agri practices..."),
    outputs="text",
    title="Kenya Agri Chatbot",
    description="Fine-tuned T5 for agricultural Q&A"
)

iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7b2a7208178b896b6c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


