In [1]:
import os
import re
import time

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

In [2]:
# Load the incident data file into `df`; the filtering cell below works from this frame.
df = pd.read_csv(filepath_or_buffer="IDS.csv")

In [3]:
# Run configuration: column names, file locations and checkpoint cadence.

# Column holding the free-text description passed to the WPS check.
DESC_COL = "Overall Submission Description (may describe multiple incidents)"

# Input data and output locations.
INPUT_FILE = "IDS.csv"
PROGRESS_FILE = "Filtered_WPS_Progress.csv"  # checkpoint read on resume, rewritten by the main loop
FINAL_FILE = "Filtered_WPS_Final.csv"

# Number of processed rows between checkpoint writes.
SAVE_EVERY = 100

In [4]:
# Keep only incidents with a human impact that occurred in the U.S.
# Column headers and the country code are normalised first so the comparisons
# below are not defeated by stray whitespace or letter case.
df.columns = df.columns.str.strip()
df["Country"] = df["Country"].str.strip().str.upper()

has_human_impact = df["Impact of Incident"].str.contains("H", na=False)
in_us = df["Country"] == "US"
df_human = df[has_human_impact & in_us].reset_index(drop=True)

In [5]:
# Sanity check: number of rows left after the human-impact / U.S. filter.
print("Filtered dataset size:", df_human.shape[0])

Filtered dataset size: 13011


In [6]:
# Enable tqdm's pandas integration.
# NOTE(review): no `progress_apply` call is visible in this notebook, so this
# registration may be unnecessary — confirm before removing.
tqdm.pandas()

In [7]:
# Build the API client used by the classification helpers below.
# The key is read from the DEEPSEEK_API_KEY environment variable so that it is
# never stored in the notebook. NOTE(review): the key that used to be
# hard-coded in this cell must be treated as leaked and revoked.
_api_key = os.environ.get("DEEPSEEK_API_KEY")
if not _api_key:
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set; export it before running this notebook."
    )
client = OpenAI(
    api_key=_api_key,
    base_url="https://api.deepseek.com",
)

In [8]:
# Checkpoint/resume: if a progress file exists, reload the rows that were
# already classified and continue after them; otherwise start with an empty
# result list at row 0.
results, start_index = [], 0
if not os.path.exists(PROGRESS_FILE):
    print("🚀 Starting from scratch")
else:
    existing = pd.read_csv(PROGRESS_FILE)
    results = existing.to_dict(orient="records")
    start_index = existing.shape[0]
    print(f"🔁 Resuming from row {start_index}")

🚀 Starting from scratch


In [9]:
# check product type
def is_pesticide_product(product_name, max_attempts=3, retry_delay=5):
    """Ask the chat model whether a product is a WPS-relevant pesticide.

    Sends one single-turn prompt containing ``product_name`` to the
    ``deepseek-chat`` model through the module-level ``client``.

    Args:
        product_name: Name of the chemical product or mixture; interpolated
            verbatim into the prompt.
        max_attempts: Number of API calls to try before giving up.
        retry_delay: Seconds to wait between failed attempts (no wait after
            the last one).

    Returns:
        ``"Yes"`` or ``"No"`` when the reply starts with that word (ignoring
        case and leading quotes/punctuation), the stripped raw reply when it
        starts with neither, or ``"ERROR"`` if every attempt raised.
    """
    prompt = f"""
You are a chemical safety and pesticide regulation expert.

Given the name of a chemical product or mixture, determine whether this chemical is likely used as a **pesticide or agricultural chemical** that could be subject to **Worker Protection Standard (WPS)** enforcement.

Consider the chemical "included" if it is used in agricultural fields, greenhouses, or orchards to control pests, diseases, or weeds.

Chemical:
{product_name}

Respond only with:
- "Yes"
- "No"
"""
    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )
            content = response.choices[0].message.content
            if content is None:
                # Treat a reply without text like any other failed attempt.
                raise ValueError("model returned no text content")
            answer = content.strip()
            # Callers compare the result with an exact "yes"; collapse replies
            # such as '"Yes"', 'Yes.' or 'No, because ...' to the bare verdict
            # so they are not silently misclassified.
            verdict = re.match(r"\W*(yes|no)\b", answer, re.IGNORECASE)
            return verdict.group(1).capitalize() if verdict else answer
        except Exception as e:
            print(f"⚠️ Product check error: {e} (attempt {attempt+1}/{max_attempts})")
            if attempt + 1 < max_attempts:
                time.sleep(retry_delay)
    return "ERROR"

In [10]:
# filter wps related
def is_wps_incident(description, max_attempts=3, retry_delay=5):
    """Ask the chat model whether an incident description is WPS-relevant.

    Sends one single-turn prompt containing ``description`` to the
    ``deepseek-chat`` model through the module-level ``client``.

    Args:
        description: Free-text incident description; interpolated verbatim
            into the prompt.
        max_attempts: Number of API calls to try before giving up.
        retry_delay: Seconds to wait between failed attempts (no wait after
            the last one).

    Returns:
        ``"Yes"`` or ``"No"`` when the reply starts with that word (ignoring
        case and leading quotes/punctuation), the stripped raw reply when it
        starts with neither, or ``"ERROR"`` if every attempt raised.
    """
    prompt = f"""
You are a domain expert in agricultural labor safety and pesticide regulation.

Given a self-reported incident description, determine whether the case is relevant to the **Worker Protection Standard (WPS)**.

Consider it relevant if:
- A human (especially a worker, field laborer, pesticide handler) was affected
- The incident occurred in an agricultural context (e.g., farm, greenhouse, orchard, nursery)

If the report clearly involves most or all of the above, respond "Yes". If not, respond "No".

Incident Description:
{description}
"""
    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )
            content = response.choices[0].message.content
            if content is None:
                # Treat a reply without text like any other failed attempt.
                raise ValueError("model returned no text content")
            answer = content.strip()
            # The prompt does not restrict the reply to a bare word, so the
            # model may append an explanation. Downstream code compares with an
            # exact "yes"; reduce the reply to its leading verdict.
            verdict = re.match(r"\W*(yes|no)\b", answer, re.IGNORECASE)
            return verdict.group(1).capitalize() if verdict else answer
        except Exception as e:
            print(f"⚠️ WPS check error: {e} (attempt {attempt+1}/{max_attempts})")
            if attempt + 1 < max_attempts:
                time.sleep(retry_delay)
    return "ERROR"

In [11]:
# main loop: filter the is_pesticide_product then is_wps_incident
#
# Each row of `df_human` is classified in two stages: the product check first,
# and the incident check only when the product check says "yes". One result
# dict per row is appended to `results`, in row order, so `len(results)` is
# always the index of the next row to process.
total_rows = len(df_human)

# Resume from the number of rows already in `results` rather than from
# `start_index`, so re-running this cell after an interruption does not append
# the same rows twice.
resume_at = len(results)

# Rows known to be in PROGRESS_FILE; `start_index` is the count loaded by the
# resume cell (0 when starting from scratch).
rows_on_disk = start_index

try:
    for i in tqdm(range(resume_at, total_rows)):
        row = df_human.iloc[i]
        product = str(row["Product Names"])
        description = str(row[DESC_COL])

        pesticide_check = is_pesticide_product(product)
        product_verdict = pesticide_check.lower()
        if product_verdict == "yes":
            wps_check = is_wps_incident(description)
        elif product_verdict == "error":
            # The product check failed, so WPS relevance is unknown; do not
            # record it as a confident "No".
            wps_check = "ERROR"
        else:
            wps_check = "No"

        # save result
        result_row = row.to_dict()
        result_row["is_pesticide_related"] = pesticide_check
        result_row["is_wps_related"] = wps_check
        results.append(result_row)

        # save the progress every SAVE_EVERY rows and after the last row
        if (i + 1) % SAVE_EVERY == 0 or (i + 1) == total_rows:
            temp_df = pd.DataFrame(results)
            temp_df.to_csv(PROGRESS_FILE, index=False)
            rows_on_disk = len(results)
            print(f"✅ Progress saved at row {i+1} to {PROGRESS_FILE}")
finally:
    # On interruption or error, flush rows classified since the last
    # checkpoint. Skipped when nothing new exists, which also avoids writing
    # an empty CSV that the resume cell could not read back.
    if len(results) > rows_on_disk:
        temp_df = pd.DataFrame(results)
        temp_df.to_csv(PROGRESS_FILE, index=False)
        print(f"💾 Partial progress saved ({len(results)} rows) to {PROGRESS_FILE}")

  0%|                                                                                                                                           | 9/13011 [01:48<43:23:24, 12.01s/it]


KeyboardInterrupt: 

In [None]:
# result
# Build the classified frame from `results` (one dict per processed row) and
# keep only the rows judged WPS-related. `df_pesticide` was previously used
# here without ever being defined, which raised NameError.
df_pesticide = pd.DataFrame(results)
if df_pesticide.empty:
    raise RuntimeError("No classified rows available; run the main loop first.")
if len(df_pesticide) < len(df_human):
    # The main loop has not covered every row yet; make that visible.
    print(f"⚠️ Only {len(df_pesticide)}/{len(df_human)} rows classified; the final file is partial.")

# astype(str) keeps the comparison safe if a value was read back as NaN.
is_wps = df_pesticide["is_wps_related"].astype(str).str.strip().str.lower() == "yes"
final_df = df_pesticide[is_wps]
final_df.to_csv(FINAL_FILE, index=False)
print(f"Done. Saved to {FINAL_FILE}")