In [1]:
from google.colab import drive
drive.mount('/content/drive')
#SAVE_PATH = "/content/drive/MyDrive/imdb-bert2"


Mounted at /content/drive


In [2]:
# Step 1: Install Required Libraries
!pip install transformers datasets scikit-learn pandas tqdm



In [3]:

# Imports
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)


Using device: cpu


In [None]:
# Step 4: Load Dataset
df = pd.read_csv("agri_classification_dataset.csv")
#df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})


In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.1, random_state=42
)



In [None]:
import os

In [None]:
# Step 6: Tokenize (save if not already saved)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
torch.save(train_encodings, f"{SAVE_PATH}/train_encodings.pt")
torch.save(val_encodings, f"{SAVE_PATH}/val_encodings.pt")



In [None]:
# Step 7: Dataset Class
class IMDBDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }
    def __len__(self):
        return len(self.labels)

train_dataset = IMDBDataset(train_encodings, train_labels)
val_dataset = IMDBDataset(val_encodings, val_labels)


In [None]:
# Step 8: Model & Training Setup
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

optimizer = AdamW(model.parameters(), lr=1e-5)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import classification_report
from transformers import get_scheduler



# ✅ Add Scheduler
num_epochs = 5
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=num_training_steps,
)

# ✅ Training loop with scheduler
for epoch in range(num_epochs):
    try:
        model.train()
        print(f"🔁 Epoch {epoch + 1}/{num_epochs}")
        loop = tqdm(train_loader)
        for batch in loop:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()  # ✅ Step the scheduler
            loop.set_description(f"Epoch {epoch + 1}")
            loop.set_postfix(loss=loss.item())

        # ✅ Save after each epoch
        model.save_pretrained(f"{SAVE_PATH}/checkpoint-epoch{epoch+1}")
        tokenizer.save_pretrained(f"{SAVE_PATH}/checkpoint-epoch{epoch+1}")
        print(f"✅ Model saved after epoch {epoch+1}")

    except Exception as e:
        print("❌ Error occurred, saving current checkpoint...")
        model.save_pretrained(f"{SAVE_PATH}/checkpoint-epoch{epoch+1}-crash")
        tokenizer.save_pretrained(f"{SAVE_PATH}/checkpoint-epoch{epoch+1}-crash")
        raise e

# ✅ Evaluation after training
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in tqdm(val_loader, desc="🔍 Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, axis=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("\n📊 Classification Report on Validation Set:")
print(classification_report(all_labels, all_preds, target_names=["Non-agriculture", "Agriculture"]))


🔁 Epoch 1/5


  0%|          | 0/11250 [00:00<?, ?it/s]

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Change this to your desired checkpoint
checkpoint_path = "/content/drive/MyDrive/imdb-bert1/checkpoint-epoch3-fine-tuned-fine-tuned"


tokenizer = BertTokenizer.from_pretrained(checkpoint_path)
model = BertForSequenceClassification.from_pretrained(checkpoint_path)
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [5]:
def predict_sentiment(text):
    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        label = "Agriculture" if pred == 1 else "Non-agriculture"
        confidence = probs[0][pred].item()

    return label, confidence


In [16]:
examples = [
    "My wheat crops are drying up even though the soil is moist—could it be a fungus or something else?",
    "The recent Bollywood film touched upon farmer suicides, quite a heavy topic.",
    "How do I switch from chemical to organic farming without losing my yield?",
    "Do you know the actor who played the farmer in that Marathi biopic?",
    "Water stress in maize during tasseling stage is affecting cob formation—any recommendations?",
    "Just watched a documentary on climate change and its effects on agriculture in the Himalayan belt.",
    "I heard neem-based pesticides are good, but I don’t know the dilution ratio for chili crops.",
    "Can excessive rainfall delay flowering in pulses like urad and moong?",
    "My neighbors used some new fertigation technique; is that viable for sugarcane in clay soil?",
    "The storyline was confusing, but the cinematography in the paddy fields looked stunning!",
    ]

for text in examples:
    label, conf = predict_sentiment(text)
    print(f"📝 Review: {text}\n🔎 Prediction: {label} ({conf*100:.2f}% confidence)\n")


📝 Review: My wheat crops are drying up even though the soil is moist—could it be a fungus or something else?
🔎 Prediction: Agriculture (98.36% confidence)

📝 Review: The recent Bollywood film touched upon farmer suicides, quite a heavy topic.
🔎 Prediction: Non-agriculture (99.78% confidence)

📝 Review: How do I switch from chemical to organic farming without losing my yield?
🔎 Prediction: Agriculture (98.80% confidence)

📝 Review: Do you know the actor who played the farmer in that Marathi biopic?
🔎 Prediction: Non-agriculture (95.95% confidence)

📝 Review: Water stress in maize during tasseling stage is affecting cob formation—any recommendations?
🔎 Prediction: Agriculture (99.69% confidence)

📝 Review: Just watched a documentary on climate change and its effects on agriculture in the Himalayan belt.
🔎 Prediction: Non-agriculture (91.25% confidence)

📝 Review: I heard neem-based pesticides are good, but I don’t know the dilution ratio for chili crops.
🔎 Prediction: Agriculture (99.46%

In [17]:
predict_sentiment( "My wheat crops are drying up even though the soil is moist—could it be a fungus or something else?")

('Agriculture', 0.9883082509040833)

In [21]:
pd.set_option('display.max_columns', None)
print(df.head(3))


   Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3  Unnamed: 4     Unnamed: 5  \
0         NaN       Year      Month        Day       State       District   
1         NaN       2025       June         23  Tamil Nadu  Ramanthapuram   
2         NaN       2025       June         23  Tamil Nadu  Ramanthapuram   

    Unnamed: 6 Unnamed: 7 Unnamed: 8  Unnamed: 9  \
0       Sector     Season       Crop  Sl No. - Q   
1  Agriculture     Kharif     Chilli          Q1   
2  Agriculture     Kharif     Chilli          Q2   

                                         Unnamed: 10 Unnamed: 11  \
0                                           Question  Sl No. - A   
1  How much FYM-Farm yard manure should I add to ...          A1   
2  What are the market opportunities I have if I ...          A2   

                                         Unnamed: 12  
0                                             Answer  
1  Apply 8–10 tons per acre of well-decomposed FY...  
2  Chillies have market opportunities in foo

In [19]:
import pandas as pd
from tqdm import tqdm  # ✅ Import tqdm

# 🔽 Load your CSV file
df = pd.read_csv("twitter_dataset.csv")  # Replace with actual file path

# 🔽 Run prediction on the 'Text' column
agri_count = 0
predictions = []
confidences = []

# ✅ tqdm for progress bar
for question in tqdm(df['Text'], desc="🔍 Predicting", unit="question"):
    label, conf = predict_sentiment(question)
    predictions.append(label)
    confidences.append(conf)
    if label == "Agriculture":
        agri_count += 1

# 🔽 Add to DataFrame
df['predicted_label'] = predictions
df['confidence'] = confidences

# 🔽 Print the count
print(f"🌾 Total Agriculture-related questions: {agri_count} out of {len(df)}")

# 🔽 Save results
df.to_csv("questions_with_predictions.csv", index=False)
print("✅ Saved predictions to questions_with_predictions.csv")


🔍 Predicting: 100%|██████████| 10000/10000 [01:51<00:00, 89.40question/s]


🌾 Total Agriculture-related questions: 0 out of 10000
✅ Saved predictions to questions_with_predictions.csv


In [22]:
import pandas as pd
from tqdm import tqdm  # ✅ Import tqdm

# 🔽 Load your CSV file
df = pd.read_csv("Copy of Tamilnadu Crops by Ponugoti Kavya_Chilli-Tamil Nadu.csv")  # Replace with actual file path

# 🔽 Run prediction on the 'Text' column
agri_count = 0
predictions = []
confidences = []

# ✅ tqdm for progress bar
for question in tqdm(df['Unnamed: 10'], desc="🔍 Predicting", unit="question"):
    label, conf = predict_sentiment(question)
    predictions.append(label)
    confidences.append(conf)
    if label == "Agriculture":
        agri_count += 1

# 🔽 Add to DataFrame
df['predicted_label'] = predictions
df['confidence'] = confidences

# 🔽 Print the count
print(f"🌾 Total Agriculture-related questions: {agri_count} out of {len(df)}")

# 🔽 Save results
df.to_csv("questions_with_predictions.csv", index=False)
print("✅ Saved predictions to questions_with_predictions.csv")


🔍 Predicting: 100%|██████████| 202/202 [00:02<00:00, 100.58question/s]

🌾 Total Agriculture-related questions: 202 out of 202
✅ Saved predictions to questions_with_predictions.csv





In [10]:
agri_df = df[df['predicted_label'] == "Agriculture"]
agri_df['label'] = 0
agri_df[['Text', 'label']].to_csv("fine_tune_patch.csv", index=False)


In [11]:
import pandas as pd

# Load existing file
df = pd.read_csv("fine_tune_patch.csv")

# ✅ Add a new agriculture row
new_row = {
    "Text": "My wheat crops are drying up even though the soil is moist—could it be a fungus or something else?",
    "label": 1
}
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

# 🔁 Add more rows like this if you want
more_rows = [
    {"Text": "Water scarcity is severely affecting my maize production this season.", "label": 1},
    {"Text": "What fertilizers are best for increasing rice yield in loamy soil?", "label": 1}
]
df = pd.concat([df, pd.DataFrame(more_rows)], ignore_index=True)

# Save back to CSV
df.to_csv("fine_tune_patch.csv", index=False)


In [12]:
with open("fine_tune_patch.csv", "a") as f:
    f.write("""My tomato plants have yellowing leaves despite regular watering,1
What's the best organic pesticide for controlling aphids in okra?,1
Is it okay to plant sugarcane during the pre-monsoon showers?,1
How can I improve soil fertility in fields that were overused with urea?,1
My papaya plants are showing signs of leaf curl disease,1
What’s the ideal spacing for planting brinjal seedlings?,1
Why is my banana crop not flowering even after applying potash?,1
Can I grow groundnuts in sandy loam soil?,1
Looking for high-yielding hybrid paddy varieties for my next sowing season,1
How to control stem borer in paddy using integrated pest management?,1
What is the optimal temperature range for wheat germination?,1
My guava tree is dropping fruit prematurely—what could be the reason?,1
Which crops can I interplant with maize to boost yield?,1
Facing powdery mildew in my cucumber crop—need a remedy,1
Can excessive irrigation affect flowering in cotton plants?,1
I’m testing vermicompost this season—what should I monitor?,1
Best practices for transplanting rice seedlings manually?,1
Why are my chili plants showing curled leaves and stunted growth?,1
Should I rotate pulses after harvesting mustard?,1
Using bio-fertilizers for the first time in paddy—any tips?,1
""")


In [13]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd

# Load model & tokenizer
model = BertForSequenceClassification.from_pretrained(checkpoint_path)
tokenizer = BertTokenizer.from_pretrained(checkpoint_path)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
# Load your correction dataset
df = pd.read_csv("fine_tune_patch.csv")
texts = df['Text'].tolist()
labels = df['label'].tolist()

# Tokenize
encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)

# Dataset class
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }
    def __len__(self):
        return len(self.labels)

dataset = CustomDataset(encodings, labels)
loader = DataLoader(dataset, batch_size=4, shuffle=True)


In [15]:
from torch.optim import AdamW
from tqdm import tqdm  # ✅ Progress bar

optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 3

for epoch in range(num_epochs):
    print(f"🚀 Epoch {epoch+1}/{num_epochs}")
    model.train()
    loop = tqdm(loader, desc=f"Epoch {epoch+1}", unit="batch")

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())

    print(f"✅ Epoch {epoch+1} completed.\n")

# Save the fine-tuned model
model.save_pretrained(f"{checkpoint_path}-fine-tuned")
tokenizer.save_pretrained(f"{checkpoint_path}-fine-tuned")
print("✅ Model saved after fine-tuning.")


🚀 Epoch 1/3


Epoch 1: 100%|██████████| 36/36 [00:03<00:00, 11.07batch/s, loss=0.000253]


✅ Epoch 1 completed.

🚀 Epoch 2/3


Epoch 2: 100%|██████████| 36/36 [00:02<00:00, 12.32batch/s, loss=0.000198]


✅ Epoch 2 completed.

🚀 Epoch 3/3


Epoch 3: 100%|██████████| 36/36 [00:02<00:00, 13.00batch/s, loss=6.06e-5]


✅ Epoch 3 completed.

✅ Model saved after fine-tuning.


In [15]:
predict_sentiment("Today I have a meeting with prime minister")

('Non-agriculture', 0.6770172119140625)

In [26]:
import pandas as pd

df = pd.read_csv("agri_classification_dataset.csv")  # Replace with your actual filename


In [27]:
from sklearn.model_selection import train_test_split

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.9, random_state=42
)


In [29]:
from tqdm import tqdm

# Tokenize each text one by one with tqdm
input_ids = []
attention_masks = []

for text in tqdm(test_texts, desc="🔄 Tokenizing test set"):
    encoded = tokenizer.encode_plus(
        text,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt'
    )
    input_ids.append(encoded['input_ids'][0])
    attention_masks.append(encoded['attention_mask'][0])

# Now stack them
import torch
test_encodings = {
    'input_ids': torch.stack(input_ids),
    'attention_mask': torch.stack(attention_masks)
}



🔄 Tokenizing test set: 100%|██████████| 90000/90000 [04:01<00:00, 372.88it/s]


In [30]:
from torch.utils.data import Dataset, DataLoader
import torch

class AgriDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }

test_dataset = AgriDataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16)  # Use larger batch size for faster inference


In [32]:
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    loop = tqdm(test_loader, desc="🔍 Evaluating", unit="batch")
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# 📊 Classification report
print("📊 Classification Report on 90K test data:")
print(classification_report(all_labels, all_preds, target_names=["Non-agriculture", "Agriculture"]))

# 🎯 F1-score
f1 = f1_score(all_labels, all_preds)
print(f"🎯 F1 Score: {f1:.4f}")


🔍 Evaluating: 100%|██████████| 5625/5625 [43:38<00:00,  2.15batch/s]


📊 Classification Report on 90K test data:
                 precision    recall  f1-score   support

Non-agriculture       1.00      1.00      1.00     45046
    Agriculture       1.00      1.00      1.00     44954

       accuracy                           1.00     90000
      macro avg       1.00      1.00      1.00     90000
   weighted avg       1.00      1.00      1.00     90000

🎯 F1 Score: 0.9997


In [33]:
!pip install gradio --quiet

import gradio as gr
import torch

# Function to predict label
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        label = "Agriculture" if pred == 1 else "Non-agriculture"
        confidence = probs[0][pred].item()
        return f"{label} ({confidence*100:.2f}%)"

# Gradio UI
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=4, placeholder="Enter your text here..."),
    outputs="text",
    title="🌾 Agriculture Detector",
    description="Paste your text below and find out whether it's agriculture-related or not."
)

# Launch
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e4f61fd4b2f319a650.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


