In [44]:
import spacy
import json
import os

In [45]:
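# One-time setup (run in a shell): python -m spacy download en_core_web_trf
# NOTE: the code below loads 'en_core_web_sm', so that model must be installed too:
#   python -m spacy download en_core_web_sm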

In [46]:
# Load the spaCy pipeline once, at module level. `return_svo` reads this
# global `nlp` object to parse text, so this cell must run before any
# extraction cell. The 'en_core_web_sm' model package has to be installed
# beforehand (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')

In [47]:
def return_subject(verb):
    """Return the first subject dependent of ``verb``.

    Scans ``verb.children`` in iteration order and returns the first child
    whose ``dep_`` label is ``'nsubj'`` or ``'nsubjpass'``.

    Args:
        verb: A token-like object exposing ``children`` (an iterable of
            tokens that each have a ``dep_`` attribute).

    Returns:
        The matching child token, or ``None`` when no child carries a
        subject label.
    """
    subject_labels = ('nsubj', 'nsubjpass')
    return next(
        (child for child in verb.children if child.dep_ in subject_labels),
        None,
    )

In [48]:
def return_object(verb):
    """Return the object of ``verb``, or ``None`` if it has none.

    Lookup order:

    1. The first direct child of ``verb`` labelled ``'dobj'`` or
       ``'dative'`` (or a direct ``'pobj'`` child whose head is itself a
       ``'prep'`` dependent). This pass is unchanged from the original
       behaviour, so every input that used to yield an object still yields
       the same one.
    2. Otherwise, the first ``'pobj'`` found under a ``'prep'`` child of
       ``verb`` (e.g. "lives in Paris" -> "Paris").

    Args:
        verb: A token-like object exposing ``children``; each child exposes
            ``dep_``, ``head`` and (for ``prep`` children) ``children``.

    Returns:
        The object token, or ``None`` when neither pass finds one.
    """
    # Materialise once: the children are walked twice below, and
    # `children` may be a one-shot iterator.
    children = list(verb.children)

    # Pass 1: objects attached directly to the verb.
    for possible_object in children:
        if possible_object.dep_ in ('dobj', 'dative', 'pobj'):
            # The head of a direct child is the verb itself, so this only
            # accepts a direct 'pobj' when the verb is a 'prep' dependent.
            if possible_object.dep_ == 'pobj' and possible_object.head.dep_ != 'prep':
                continue
            return possible_object

    # Pass 2: prepositional objects hang off the preposition, not the verb,
    # so the first pass can never see them. Descend one level through
    # 'prep' children to pick them up.
    for preposition in children:
        if preposition.dep_ != 'prep':
            continue
        for candidate in preposition.children:
            if candidate.dep_ == 'pobj':
                return candidate

    return None

In [49]:
def return_svo(text):
    """Extract subject-verb-object records from ``text``.

    The text is parsed with the module-level ``nlp`` pipeline. For every
    token tagged ``VERB`` in every sentence, ``return_subject`` and
    ``return_object`` are consulted; a record is emitted only when both
    return a truthy token.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of dicts with the keys ``"subject"``, ``"verb"``, ``"stem"``
        (the verb's lemma) and ``"object"``, in document order. The list is
        empty when no verb has both a subject and an object.
    """
    triples = []
    for sentence in nlp(text).sents:
        for token in sentence:
            if token.pos_ != 'VERB':
                continue
            subject_token = return_subject(token)
            object_token = return_object(token)
            # Both sides are required; incomplete relations are dropped.
            if not (subject_token and object_token):
                continue
            triples.append(dict(
                subject=subject_token.text,
                verb=token.text,
                stem=token.lemma_,
                object=object_token.text,
            ))
    return triples

In [50]:
 def iterate_category(category_name, category_prefix, error_log_path,
                      max_files=100, dataset_root='./myDataset',
                      output_dir='./myResult/Using_Spacy'):
     """Extract SVO records for one dataset category and persist them.

     Reads the first ``max_files`` entries (sorted by name) of
     ``{dataset_root}/{category_name}``, runs ``return_svo`` on each file's
     text and writes the collected results to
     ``{output_dir}/{category_name}.json``. Problems are appended to
     ``error_log_path``, one line per affected file.

     Args:
         category_name: Sub-folder of ``dataset_root`` to process; also used
             as the output JSON file name.
         category_prefix: Prefix for result keys, which take the form
             ``"{category_prefix}-{idx}"`` where ``idx`` is the 1-based
             position of the file in the sorted listing.
         error_log_path: Text file that receives error lines (opened in
             append mode, so lines from earlier calls are kept).
         max_files: Maximum number of directory entries to process;
             ``None`` processes all of them. Defaults to 100.
         dataset_root: Folder containing the category sub-folders.
         output_dir: Folder that receives the per-category JSON file.

     Returns:
         None. Results are written to disk and two status lines are printed.

     Raises:
         OSError: If the category folder cannot be listed or an output file
             cannot be written. Per-file failures are logged, not raised.
     """
     folder_path = f'{dataset_root}/{category_name}'
     files = sorted(os.listdir(folder_path))[:max_files]
     all_data = {}
     errors = []

     for idx, file_name in enumerate(files, start=1):
         file_path = os.path.join(folder_path, file_name)
         try:
             # Read first and close the handle before the (slow) NLP parse.
             with open(file_path, 'r', encoding='utf-8') as file:
                 text = file.read()
             svos = return_svo(text)
             if svos:
                 all_data[f"{category_prefix}-{idx}"] = {
                     "SVO_relationships": svos,
                     "total_SVOs": len(svos)
                 }
             else:
                 errors.append(f"No SVOs found in file {file_name} ({category_prefix}-{idx})")
         except Exception as e:
             # Deliberately best-effort: one unreadable file (or a directory
             # entry) must not abort the whole category; record it and go on.
             errors.append(f"Error processing file {file_name} ({category_prefix}-{idx}): {e}")

     output_json_path = f'{output_dir}/{category_name}.json'
     os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
     with open(output_json_path, 'w', encoding='utf-8') as json_file:
         json.dump(all_data, json_file, indent=4, ensure_ascii=False)
     print(f"SVOs for {category_name} saved to {output_json_path}")

     # Write errors to log file. Make sure its folder exists so the function
     # does not depend on the caller having created it.
     log_dir = os.path.dirname(error_log_path)
     if log_dir:
         os.makedirs(log_dir, exist_ok=True)
     with open(error_log_path, 'a', encoding='utf-8') as log_file:
         log_file.writelines(error + '\n' for error in errors)
     print(f"Errors for {category_name} logged to {error_log_path}")

In [51]:
# Error log path. iterate_category opens this file in append mode, so lines
# from previous runs of this cell accumulate; delete the file first if a
# clean log is wanted.
error_log_path = './myResult/Using_Spacy/error_log.txt'
os.makedirs(os.path.dirname(error_log_path), exist_ok=True)

# Categories (dataset sub-folder names) and the prefix used for their result IDs.
categories = {
    "financial": "Fin",
    "literature": "Lit",
    "medical": "Med",
    "movies": "Mov",
    "news": "New",
}

# Process every category in the order declared above. Driving the calls from
# the mapping keeps the category list in one place.
for category_name, category_prefix in categories.items():
    iterate_category(category_name, category_prefix, error_log_path)

SVOs for financial saved to ./myResult/Using_Spacy/financial.json
Errors for financial logged to ./myResult/Using_Spacy/error_log.txt
SVOs for literature saved to ./myResult/Using_Spacy/literature.json
Errors for literature logged to ./myResult/Using_Spacy/error_log.txt
SVOs for medical saved to ./myResult/Using_Spacy/medical.json
Errors for medical logged to ./myResult/Using_Spacy/error_log.txt
SVOs for movies saved to ./myResult/Using_Spacy/movies.json
Errors for movies logged to ./myResult/Using_Spacy/error_log.txt
SVOs for news saved to ./myResult/Using_Spacy/news.json
Errors for news logged to ./myResult/Using_Spacy/error_log.txt
