**Libraries**

The *os* module is a built-in library that provides functions for interacting with the *operating system*.

The *re* module provides *regular expression* matching operations.

*Pandas* is a library used for data manipulation and analysis.


In [7]:
import os
import re
import pandas as pd

**Token count**

Pre-processing and total count

In [12]:
#Token Count

def token_count(folder_path):
    """Count the tokens in every ``.txt`` file directly inside ``folder_path``.

    Each file is read as UTF-8 and lower-cased; every character that is not a
    word character, whitespace or a hyphen is removed; a token is then a run of
    word characters, optionally joined by single hyphens (``well-known`` is one
    token).

    Parameters
    ----------
    folder_path : str
        Directory to scan. Sub-directories are not visited.

    Returns
    -------
    int
        Total number of tokens across all matching files.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` / ``open`` (e.g. the folder is missing).
    """
    total_tokens = 0
    for filename in os.listdir(folder_path):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            text = file.read().lower()
        text = re.sub(r'[^\w\s-]', '', text)  # keep hyphens, remove other punctuation
        # Count matches lazily instead of collecting every token of the corpus
        # in one list just to take its length.
        total_tokens += sum(1 for _ in re.finditer(r'\b\w+(?:-\w+)*\b', text))
    return total_tokens

# Corpus folder, given as a relative path (resolved against the working directory).
Folder = "Docs_English"
# Last expression of the cell, so the notebook displays the returned token total.
token_count(Folder)

2253604

**Predicting keywords based on 'morphemes'**

The goal of this experiment is to extract pesticide names from the documents by matching words against substrings (which may or may not be morphemes) taken from a list of strings. For each substring length from *2 to 5* characters, the top 10 substrings are used. The experiment is run 4 times, with a narrower range of substring lengths in each run (2-5, 3-5, 4-5, and 5).

In [29]:
def extract_matching_strings_from_txt(path, substrings):
    """Return the distinct words in a text file that contain any given substring.

    The file is read as UTF-8; every character that is not a word character,
    whitespace or a hyphen is removed; words are runs of word characters
    optionally joined by single hyphens. Matching is case-sensitive: the text
    is not lower-cased, and words are returned exactly as they appear.

    Parameters
    ----------
    path : str
        Path of the text file to scan.
    substrings : iterable of str
        Substrings to look for. Any iterable is accepted, including one-shot
        iterables such as generators.

    Returns
    -------
    set of str
        Words containing at least one of ``substrings``; empty if none match.
    """
    # Materialise once: a generator would otherwise be exhausted while testing
    # the first word, and every later word would silently fail to match.
    needles = tuple(substrings)
    with open(path, "r", encoding="utf-8") as file:
        text = file.read()
    text = re.sub(r'[^\w\s-]', '', text)
    # De-duplicate before testing so each distinct word is scanned only once.
    unique_words = set(re.findall(r"\b\w+(?:-\w+)*\b", text))
    return {word for word in unique_words if any(sub in word for sub in needles)}

def scrape_txt_folder(folder_path, substrings):
    """Scan every ``.txt`` file directly inside ``folder_path`` for matching words.

    Returns a dict mapping file name to the result of
    ``extract_matching_strings_from_txt`` for that file. Files whose result is
    empty are left out of the dict.
    """
    results = {}
    txt_names = (name for name in os.listdir(folder_path) if name.endswith(".txt"))
    for name in txt_names:
        hits = extract_matching_strings_from_txt(os.path.join(folder_path, name), substrings)
        if not hits:
            continue  # nothing matched in this file
        results[name] = hits
    return results

def merge_unique(found_matches):
    """Combine the per-file word collections into one sorted list of distinct words.

    ``found_matches`` maps any key to an iterable of words; only the values are
    used. An empty mapping yields an empty list.
    """
    return sorted(set().union(*found_matches.values()))

def save_list_to_excel(string_list, filename="unique_matches1.xlsx"):
    """Write ``string_list`` to an Excel file as a single "Matched Strings" column.

    The row index is not written. A confirmation line naming the file is
    printed afterwards. Returns ``None``.
    """
    pd.DataFrame(string_list, columns=["Matched Strings"]).to_excel(filename, index=False)
    print(f"Excel file saved as: {filename}")



Excel file saved as: unique_matches1.xlsx


In [None]:
# Reference list of pesticide names that the predictions are scored against.
# Several entries are spelling or hyphenation variants of one another
# (e.g. "chlorpirifos" / "chlorpyrifos" / "chlorpyriphos") and are kept as separate items.
# NOTE(review): entries containing a space (e.g. "azinphos methyl") cannot equal any single
# word produced by the regex in extract_matching_strings_from_txt, which matches only word
# characters and hyphens; the comparison in get_common_strings is also case-sensitive
# (entries such as "DDVP" are upper-case) -- confirm both are intended.
ground_truth = [
    "acephate", "akton", "aspon", "azamethiphos-oxon", "azinphos methyl", "azinphos-methyl",
    "azinphosmethyl", "bensulide", "bromophos", "bromophos-ethyl", "butonate", "cadusafos",
    "carbophenothion", "chlorethoxyfos", "chlorethoxyphos", "chlorfenvinphos", "chlorfenvinphos-oxon",
    "chlorpirifos", "chlorpyrifos", "chlorpyrifos oxon", "chlorpyrifos-methyl", "chlorpyrifos-oxon",
    "chlorpyriphos", "chlorpyrofos me", "chlorpyrofos methyl", "coumaphos", "crotoxyphos", "crufomate",
    "cyanophos", "cythioate", "DDVP", "DEF", "demeton", "demeton S", "demeton-methyl", "demeton-O",
    "demeton-S", "demeton-S-methyl", "dialifor", "diamidfos", "diazinon", "diazinon-oxon", "diazoxon",
    "dicapthon", "dichlofenthion", "dichlorvos", "dichrotophos", "dicrotophos", "dimethoate", "dioxathion",
    "disulfoton", "ebufos", "EPN", "ethephon", "ethion", "ethoprop", "ethoprophos", "ethyl paraoxon",
    "ethyl parathion", "ethylparaoxon", "ethylparathion", "etrimfos", "famfos", "famphos", "famphur",
    "fenamiphos", "fenchlorfos", "fenchlorphos", "fenitrooxon", "fenitrothion", "fensulfothion", "fenthion",
    "fonofos", "formothion", "glufosinate", "glyphosate", "heptenophos", "iprobenphos", "isazophos",
    "isoazinphos", "isodiazinon", "isofenchlorphos", "isofenitrothion", "isofenphos", "isomalathion",
    "isophenphos", "leptophos", "malaoxon", "malathion", "methamidaphos", "methamidofos", "methamidophos",
    "methidathion", "methyl chlorpyrifos", "methyl paraoxon", "methyl parathion", "methyl-paraoxon",
    "methyl-parathion", "methylchlorpyrifos", "methylparaoxon", "methylparathion", "mevinphos", "mipafox",
    "monocrotophos", "naled", "oxydemethon methyl", "oxydemeton methyl", "oxydemeton-methyl", "paraoxon",
    "parathion", "parathion-ethyl", "parathion-methyl", "parathionmethyl", "phorate", "phosalone",
    "phosfolan", "phosfon", "phosmet", "phosphamidon", "phostebupirim", "phoxim", "pirimiphos ethyl",
    "pirimiphos methyl", "pirimiphos-ethyl", "pirimiphos-methyl", "pirimphos me", "pirimphos methyl",
    "profenofos", "propetamphos", "prothiofos", "quinalphos", "ronnel", "sulfotep", "sulfotepp",
    "sulprofos", "tebupirimfos", "tebupirimphos", "temephos", "TEPP", "terbufos", "tetrachlorvinphos",
    "thiometon", "thiophos", "trans-phosphamidon", "tribufos", "trichlorfon"] #Ground truth

**Experiment 1**

Strings 2-5

In [None]:
# Candidate substrings, ten per length: 2, 3, 4 and 5 characters (one row per length).
substrings_to_search1 = ['os', 'on', 'ho', 'ph', 'th', 'fo', 'hi', 'et', 'io', 'ox', 
                        'pho', 'hos', 'thi', 'fos', 'hio', 'ion', 'chl', 'hlo', 'lor', 'met',
                        'phos', 'thio', 'hion', 'chlo', 'hlor', 'para', 'oxon', 'athi', 'opho', 'lorp',
                        'thion', 'chlor', 'athio', 'ophos', 'hlorp', 'nphos', 'lorpy', 'orpyr', 'demet', 'meton']

# Folder of .txt documents to scan (relative path).
folder_path = "Docs_English"

# Per-file sets of words that contain at least one candidate substring.
found_matches1 = scrape_txt_folder(folder_path, substrings_to_search1)

# Flatten to a single sorted list of distinct words.
all_unique_words1 = merge_unique(found_matches1)

# No filename is passed, so the default "unique_matches1.xlsx" is written.
save_list_to_excel(all_unique_words1)

Excel file saved as: unique_matches1.xlsx


In [None]:
def read_column_as_list(filename, column_name):
    """Load one column of an Excel file as a list of strings.

    Cells that pandas reads as missing are dropped; the remaining values are
    converted to ``str``.
    """
    sheet = pd.read_excel(filename)
    present_values = sheet[column_name].dropna()
    as_text = present_values.astype(str)
    return as_text.tolist()

def get_common_strings(list1, list2):
    """Return the items present in both inputs, de-duplicated and sorted.

    Items are compared by exact equality, so string comparison is case-sensitive.
    """
    shared = set(list1).intersection(set(list2))
    return sorted(shared)

# Reload the predictions written by the experiment cell.
# (The file name was wrapped in doubled quotes, which is a SyntaxError.)
list1 = read_column_as_list("unique_matches1.xlsx", "Matched Strings")

# Predictions that exactly equal a ground-truth entry.
true_positives1 = get_common_strings(list1, ground_truth)

# Reused by the evaluation cells of the later experiments.
count_ground_truth = len(ground_truth)

count_predictions1 = len(list1)

# Every prediction that is not an exact ground-truth match.
false_positives1 = count_predictions1 - len(true_positives1)

# Ground-truth entries that were never predicted.
false_negatives1 = count_ground_truth - len(true_positives1)

print('## Experiment 1 - 2-5 ##\n')
print('Predicted words: ', count_predictions1)
print('Ground truth: ', count_ground_truth)
print('Matches:', len(true_positives1))
print('False positives: ', false_positives1)
print('False negatives: ', false_negatives1)


## Experiment 1 - 2-5 ##

Predicted words:  41042
Ground truth:  146
Matches: 93
False positives:  40949
False negatives:  53


**Evaluation 1**

Precision, Recall and F1

In [65]:
def precision_recall(true_positives, false_positives, false_negatives):
    """Print precision, recall and F1 for one experiment.

    Parameters
    ----------
    true_positives : sized collection
        The matched items themselves; only their count is used.
    false_positives : int
        Number of predictions that are not matches.
    false_negatives : int
        Number of ground-truth entries that were not predicted.

    Each metric falls back to 0.0 when its denominator is zero. The three
    values are printed, one per line; nothing is returned.
    """
    hits = len(true_positives)
    predicted_total = hits + false_positives
    actual_total = hits + false_negatives

    precision = hits / predicted_total if predicted_total > 0 else 0.0
    recall = hits / actual_total if actual_total > 0 else 0.0

    denominator = precision + recall
    if denominator > 0:
        f1 = (2 * precision * recall) / denominator
    else:
        f1 = 0.0

    for label, value in (('Precision: ', precision), ('Recall: ', recall), ('F1: ', f1)):
        print(label, value)

# Experiment 1 metrics: the first argument is the list of matches, the other two are counts.
precision_recall(true_positives1,false_positives1, false_negatives1)

Precision:  0.00226597144388675
Recall:  0.636986301369863
F1:  0.004515878411187725


**Experiment 2**

Strings 3-5

In [56]:
# Candidate substrings, ten per length: 3, 4 and 5 characters (one row per length).
substrings_to_search2 = ['pho', 'hos', 'thi', 'fos', 'hio', 'ion', 'chl', 'hlo', 'lor', 'met',
                        'phos', 'thio', 'hion', 'chlo', 'hlor', 'para', 'oxon', 'athi', 'opho', 'lorp',
                        'thion', 'chlor', 'athio', 'ophos', 'hlorp', 'nphos', 'lorpy', 'orpyr', 'demet', 'meton']

# Folder of .txt documents to scan (relative path).
folder_path = "Docs_English"

# Per-file sets of words that contain at least one candidate substring.
found_matches2 = scrape_txt_folder(folder_path, substrings_to_search2)

# Flatten to a single sorted list of distinct words.
all_unique_words2 = merge_unique(found_matches2)

# Written to its own file so the other experiments' results are not overwritten.
save_list_to_excel(all_unique_words2, filename='unique_matches2.xlsx')

Excel file saved as: unique_matches2.xlsx


In [None]:
# Reload the predictions written by the Experiment 2 cell.
list2 = read_column_as_list("unique_matches2.xlsx", "Matched Strings")

# Predictions that exactly equal a ground-truth entry.
true_positives2 = get_common_strings(list2, ground_truth)

count_predictions2 = len(list2)

# Every prediction that is not an exact ground-truth match.
false_positives2 = count_predictions2 - len(true_positives2)

# count_ground_truth is not defined in this cell; it is set in the Experiment 1 evaluation cell.
false_negatives2 = count_ground_truth - len(true_positives2)

print('## Experiment 2 - 3-5 ##\n')
print('Predicted words: ', count_predictions2)
print('Ground truth: ', count_ground_truth)
print('Matches:', len(true_positives2))
print('False positives: ', false_positives2) 
print('False negatives: ', false_negatives2)

## Experiment 2 - 3-5 ##

Predicted words:  19385
Ground truth:  146
Matches: 79
False positives:  19306
False negatives:  67


**Evaluation 2**

Precision, Recall and F1

In [66]:
# Experiment 2 metrics: the first argument is the list of matches, the other two are counts.
precision_recall(true_positives2,false_positives2, false_negatives2)

Precision:  0.004075315965953057
Recall:  0.541095890410959
F1:  0.008089703548205417


**Experiment 3**

Strings 4-5

In [68]:
# Candidate substrings, ten per length: 4 and 5 characters (one row per length).
substrings_to_search3 = ['phos', 'thio', 'hion', 'chlo', 'hlor', 'para', 'oxon', 'athi', 'opho', 'lorp',
                        'thion', 'chlor', 'athio', 'ophos', 'hlorp', 'nphos', 'lorpy', 'orpyr', 'demet', 'meton']

# Folder of .txt documents to scan (relative path).
folder_path = "Docs_English"

# Per-file sets of words that contain at least one candidate substring.
found_matches3 = scrape_txt_folder(folder_path, substrings_to_search3)

# Flatten to a single sorted list of distinct words.
all_unique_words3 = merge_unique(found_matches3)

# Written to its own file so the other experiments' results are not overwritten.
save_list_to_excel(all_unique_words3, filename='unique_matches3.xlsx')

Excel file saved as: unique_matches3.xlsx


In [None]:
# Reload the predictions written by the Experiment 3 cell.
list3 = read_column_as_list("unique_matches3.xlsx", "Matched Strings")

# Predictions that exactly equal a ground-truth entry.
true_positives3 = get_common_strings(list3, ground_truth)

count_predictions3 = len(list3)

# Every prediction that is not an exact ground-truth match.
false_positives3 = count_predictions3 - len(true_positives3)

# count_ground_truth is not defined in this cell; it is set in the Experiment 1 evaluation cell.
false_negatives3 = count_ground_truth - len(true_positives3)

print('## Experiment 3 - 4-5 ##\n')
print('Predicted words: ', count_predictions3)
print('Ground truth: ', count_ground_truth)
print('Matches:', len(true_positives3))
print('False positives: ', false_positives3) 
print('False negatives: ', false_negatives3)

## Experiment 3 - 4-5 ##

Predicted words:  10496
Ground truth:  146
Matches: 69
False positives:  10427
False negatives:  77


**Evaluation 3**

Precision, Recall and F1

In [70]:
# Experiment 3 metrics: the first argument is the list of matches, the other two are counts.
precision_recall(true_positives3,false_positives3, false_negatives3)

Precision:  0.006573932926829268
Recall:  0.4726027397260274
F1:  0.012967487314414584


**Experiment 4**

Strings 5

In [71]:
# Candidate substrings: the ten 5-character substrings only.
substrings_to_search4 = ['thion', 'chlor', 'athio', 'ophos', 'hlorp', 'nphos', 'lorpy', 'orpyr', 'demet', 'meton']

# Folder of .txt documents to scan (relative path).
folder_path = "Docs_English"

# Per-file sets of words that contain at least one candidate substring.
found_matches4 = scrape_txt_folder(folder_path, substrings_to_search4)

# Flatten to a single sorted list of distinct words.
all_unique_words4 = merge_unique(found_matches4)

# Written to its own file so the other experiments' results are not overwritten.
save_list_to_excel(all_unique_words4, filename='unique_matches4.xlsx')

Excel file saved as: unique_matches4.xlsx


In [76]:
# Reload the predictions written by the Experiment 4 cell.
list4 = read_column_as_list("unique_matches4.xlsx", "Matched Strings")

# Predictions that exactly equal a ground-truth entry.
true_positives4 = get_common_strings(list4, ground_truth)

count_predictions4 = len(list4)

# Every prediction that is not an exact ground-truth match.
false_positives4 = count_predictions4 - len(true_positives4)

# count_ground_truth is not defined in this cell; it is set in the Experiment 1 evaluation cell.
false_negatives4 = count_ground_truth - len(true_positives4)

print('## Experiment 4 - 5 ##\n')
print('Predicted words: ', count_predictions4)
print('Ground truth: ', count_ground_truth)
print('Matches:', len(true_positives4))
print('False positives: ', false_positives4) 
print('False negatives: ', false_negatives4)

## Experiment 4 - 5 ##

Predicted words:  3107
Ground truth:  146
Matches: 44
False positives:  3063
False negatives:  102


**Evaluation 4**

Precision, Recall and F1

In [77]:
# Experiment 4 metrics: the first argument is the list of matches, the other two are counts.
precision_recall(true_positives4,false_positives4, false_negatives4)

Precision:  0.014161570646926296
Recall:  0.3013698630136986
F1:  0.02705195204426683
