This notebook draws a 500k-row sample of the Producers Direct Farmers dataset, then uses an NLP semantic-similarity model to categorize the questions for Challenge 2. The output is a CSV of the 500k rows with category labels, ready for analysis. There is also code at the bottom to create a profile report for quick EDA.

In [None]:
import pandas as pd
#from ydata_profiling import ProfileReport

from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
# Read the dataset from the Kaggle input location (Parquet format)
df = pd.read_parquet('/kaggle/input/producersdirectdata-parquet')

# Keep only the rows whose question_language is 'eng', then renumber them
# from 0 so the old row labels are discarded
is_eng = df['question_language'] == 'eng'
filtered_df = df.loc[is_eng].reset_index(drop=True)

In [None]:
# Sample up to 500k random rows for speed (fixed seed so reruns pick the same rows).
# DataFrame.sample raises ValueError when n exceeds the number of rows, so the
# requested size is capped at what the filtered data actually contains.
sample_size = min(500_000, len(filtered_df))
sampled_df = filtered_df.sample(n=sample_size, random_state=42)

#sampled_df

In [None]:
##identify unique question types &  develop summary categories

#unique_question_topic = sampled_df['question_topic'].unique().tolist()

#print(unique_question_topic)

In [None]:
## can see a wide range of topics such as: fruits, vegetables, animals, insects, flowers - use an NLP approach to categorize them automatically

##semantic similarity model - testing on a small subset before running fully
#by testing on a small subset and re-randomizing, I can create new categories based on the responses
#splitting the 500k rows into 10 unique batches for quicker inference with the semantic model

# Drop rows that have no question text before embedding
#sampled_df = sampled_df.dropna(subset=['question_topic'])
sampled_df = sampled_df.dropna(subset=['question_content'])

# Candidate labels: each question is assigned the label whose embedding is
# most similar to the question's embedding
categories = ["livestock", "harvesting", "planting", "pests", "markets", "fruits", "vegetables", "seeds", "nuts", "weather", "equipment",
             "soil", "vaccines", "raising livestock"]

# Load model
model = SentenceTransformer('/kaggle/input/all-minilm-l6-v2')

# Encode categories once so every chunk reuses the same label embeddings
category_embeddings = model.encode(categories, convert_to_tensor=True)

# Split into disjoint random chunks of at most 50k rows.
# The dropna above can leave fewer rows than a whole number of chunks, in which
# case drawing a fixed-size sample of 50k from what remains raises ValueError.
# Shuffling once and slicing covers every remaining row exactly once, with a
# shorter final chunk when the row count is not a multiple of chunk_size.
chunk_size = 50000
shuffled_df = sampled_df.sample(frac=1, random_state=42)
chunks = [
    # .copy() gives each chunk its own data so adding a column to it later
    # does not operate on a slice of shuffled_df
    shuffled_df.iloc[start:start + chunk_size].copy()
    for start in range(0, len(shuffled_df), chunk_size)
]

# Function to classify a batch
def classify_batch(df_chunk: pd.DataFrame) -> pd.DataFrame:
    """Label each question in ``df_chunk`` with its most similar category.

    The ``question_content`` values are embedded with the module-level
    ``model`` and compared by cosine similarity against the module-level
    ``category_embeddings``; the entry of ``categories`` with the highest
    similarity becomes the row's ``predicted_category``.

    Args:
        df_chunk: Rows to classify; must contain a ``question_content`` column.

    Returns:
        A new DataFrame with the same rows and index as ``df_chunk`` plus a
        ``predicted_category`` column. ``df_chunk`` itself is not modified.
    """
    questions = df_chunk['question_content'].tolist()
    if not questions:
        # Nothing to embed; still return a frame that has the output column.
        return df_chunk.assign(predicted_category=[])

    question_embeddings = model.encode(questions, convert_to_tensor=True, batch_size=32)
    similarities = util.cos_sim(question_embeddings, category_embeddings)
    # A single argmax across the category axis replaces a per-row Python loop
    # that called argmax and .item() once for every question.
    best_category_idx = torch.argmax(similarities, dim=1).tolist()
    labels = [categories[idx] for idx in best_category_idx]
    # assign() builds a new frame rather than writing a column into the caller's object.
    return df_chunk.assign(predicted_category=labels)

# Run the classifier over every chunk, keeping the labelled frames in chunk order
classified_chunks = list(map(classify_batch, chunks))

# Stack the labelled chunks into one frame with a fresh 0..n-1 index, then
# write it to CSV without the index column
final_df = pd.concat(classified_chunks, ignore_index=True)
final_df.to_csv('classified_questions_output.csv', index=False)


In [None]:
# reviewing output before full run
# 'predicted_category' is only present on final_df (the concatenated classified
# chunks); sampled_df never receives that column, so selecting it from
# sampled_df raises KeyError.
review_df = final_df[['question_topic',  'question_content', 'predicted_category','response_content']]

review_df.tail(25)

In [None]:
##code to build the pandas report
# Builds an exploratory profile of sampled_df and renders it inline.
# NOTE(review): ProfileReport is not in scope as written -- its only import
# (`from ydata_profiling import ProfileReport`) is commented out in the first
# cell, so this line raises NameError until that import is restored.
profile = ProfileReport(sampled_df, title="EDA Report", explorative=True)
profile.to_notebook_iframe()  # If you're in a Jupyter notebook

In [None]:
# Write the profile report built in the previous cell to an HTML file
profile.to_file("eda_report_producers_direct_farmers.html")