In [21]:
import os
import spacy
import pandas as pd
import re

In [9]:
# loads the "en_core_web_md" spaCy pipeline once at module level; main() reuses this `nlp` object for every text file
nlp = spacy.load("en_core_web_md")

In [10]:
# function that opens a text file and strips the markup tags from it
def open_and_clean_text(filenames):
    """Read one text file and return its content with all ``<...>`` tags removed.

    Args:
        filenames: path to a single text file (the plural name is kept for
            backward compatibility with existing callers).

    Returns:
        str: the file content with every ``<...>`` span, including the empty
        tag ``<>``, replaced by an empty string.
    """
    # matches "<", any run of characters that are not ">", then ">"
    tag_pattern = re.compile(r'\<[^>]*\>')
    # latin1 maps every byte to a character, so decoding never fails
    # (it also covers accented letters such as the Finnish ones)
    with open(filenames, encoding='latin1') as handle:
        raw_text = handle.read()
    return tag_pattern.sub("", raw_text)

In [11]:
# function which extracts the nouns, verbs, adjectives, adverbs and the three unique entities person, loc, and org
def extract_text_entities(doc):
    """Summarise a parsed document as POS frequencies and unique-entity counts.

    Args:
        doc: an iterable of tokens exposing ``pos_`` that also has an ``ents``
            attribute whose items expose ``label_`` and convert to text with
            ``str()`` (in this file: the object returned by ``nlp``).

    Returns:
        tuple: seven ints ``(noun, verb, adj, adv, person, loc, org)``.
        The first four are the number of NOUN/VERB/ADJ/ADV tokens per 10,000
        words, where "words" are all tokens whose tag is not SPACE, SYM,
        PUNCT or NUM. The last three are the number of distinct entity
        strings labelled PERSON, LOC and ORG.
        A POS tag that never occurs yields 0, and a document without any
        word yields 0 for all four frequencies.
    """
    # only the tags are needed for counting; the dtype is fixed so an empty
    # document still produces a usable (empty) Series
    pos_tags = pd.Series([token.pos_ for token in doc], dtype="object")
    pos_counts = pos_tags.value_counts()

    # "~" inverts the boolean mask: keep everything that is NOT whitespace,
    # a symbol, punctuation or a number
    is_non_word = pos_tags.isin(["SPACE", "SYM", "PUNCT", "NUM"])
    # normalisation factor: word count expressed in units of 10,000 words
    total_words = int((~is_non_word).sum()) / 10000

    def relative_frequency(tag):
        # .get() returns 0 for a tag that is absent from the document instead
        # of raising KeyError; the guard avoids dividing by zero
        if total_words == 0:
            return 0
        return round(int(pos_counts.get(tag, 0)) / total_words)

    noun = relative_frequency("NOUN")
    verb = relative_frequency("VERB")
    adj = relative_frequency("ADJ")
    adv = relative_frequency("ADV")

    # one set per label of interest (a set only holds unique values)
    unique_entities = {"PERSON": set(), "LOC": set(), "ORG": set()}
    for entity in doc.ents:
        if entity.label_ in unique_entities:
            unique_entities[entity.label_].add(str(entity))

    # returns an int for each pos and unique entity
    return (
        noun,
        verb,
        adj,
        adv,
        len(unique_entities["PERSON"]),
        len(unique_entities["LOC"]),
        len(unique_entities["ORG"]),
    )

In [27]:
# defines the main function with parameters for the input and output folder paths (run the script here)
def main(in_folderpath, out_folderpath):
    """Write one POS/entity summary table per sub-folder of ``in_folderpath``.

    Each sub-folder is processed in sorted order. Every file in it is read
    with ``open_and_clean_text``, parsed with ``nlp`` and summarised with
    ``extract_text_entities``. The rows are saved as
    ``<out_folderpath>/<sub-folder>.table.csv`` with the columns
    filename, NOUN, VERB, ADJ, ADV, PERSON, LOC, ORG.

    Args:
        in_folderpath: folder that contains one sub-folder per text collection.
        out_folderpath: folder the CSV tables are written to; it is created
            if it does not exist yet.

    Returns:
        None. The result is the CSV files written to disk.
    """
    # column names shared by every output table
    columns = ["filename", "NOUN", "VERB", "ADJ", "ADV", "PERSON", "LOC", "ORG"]

    # to_csv does not create missing folders; an empty path means the current
    # working directory, which needs no creation
    if out_folderpath:
        os.makedirs(out_folderpath, exist_ok=True)

    # iterates over each entry of the input folder in sorted order
    for directory in sorted(os.listdir(in_folderpath)):
        # the sub-folder is resolved against in_folderpath (not a fixed
        # "../in") so the function honours the path it was given
        directory_path = os.path.join(in_folderpath, directory)
        # stray files next to the sub-folders (e.g. .DS_Store) cannot be listed
        if not os.path.isdir(directory_path):
            continue

        # rows are collected in a plain list and turned into a dataframe once,
        # instead of concatenating a new dataframe per file
        rows = []
        for text_file in sorted(os.listdir(directory_path)):
            file_path = os.path.join(directory_path, text_file)
            # nested folders cannot be opened as text files
            if not os.path.isfile(file_path):
                continue
            text_cleaned = open_and_clean_text(file_path)
            # processes the cleaned text using the nlp model
            doc = nlp(text_cleaned)
            rows.append([text_file, *extract_text_entities(doc)])

        final_results = pd.DataFrame(rows, columns=columns)
        # saves the final results dataframe as a csv file in the out folder
        final_results.to_csv(f"{os.path.join(out_folderpath, directory)}.table.csv", index=False)

if __name__ == "__main__":
    # calls the main function with two arguments, "../in" and "../out", defining the in- and out-folderpaths
    main("../in", "../out")
  