#### Import necessary libraries and data.

In [None]:
# Import necessary libraries
import os
import textract
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Root folder that is walked for .txt files (relative to the working directory)
directory = "textractOutputs"

# English stopwords kept as a set for fast membership tests while filtering
stop_words = set(stopwords.words("english"))

# Single WordNet lemmatizer instance reused for every document
lemmatizer = WordNetLemmatizer()

#### Define preprocessing function.

In [None]:
def preprocess_text(text):
    """Return *text* reduced to a space-separated string of lemmas.

    The text is tokenized with ``word_tokenize``. Each token is lowercased,
    discarded unless it is purely alphabetic, discarded if it appears in
    ``stop_words``, and finally lemmatized with the module-level
    ``lemmatizer``. Surviving lemmas are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Drop punctuation, numbers and mixed tokens such as "abc123".
        if not lowered.isalpha():
            continue
        if lowered in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(lowered))
    return ' '.join(kept)

#### Go through each .txt file under textractOutputs/ and preprocess it. Collect the file name, the folder of origin, and the processed text in three parallel lists (they become the columns of the output table).

In [None]:
# Parallel lists: position i in each list describes the same source file.
file_names = []
original_folders = []
preprocessed_documents = []

for subdir, dirs, files in os.walk(directory):
    for file in files:
        # Only plain-text files are processed; everything else is skipped.
        if not file.endswith(".txt"):
            continue

        filepath = os.path.join(subdir, file)
        # Name of the directory that directly contains the file.
        folder_name = os.path.basename(subdir)

        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()

        preprocessed_text = preprocess_text(text)

        # Record the file name, its folder, and the cleaned text together.
        file_names.append(file)
        original_folders.append(folder_name)
        preprocessed_documents.append(preprocessed_text)

#### Save the file names, folder names, and preprocessed text to a CSV file.

In [None]:
# pandas is not imported by the earlier cells, so import it here before use.
import pandas as pd

# One row per processed file; the three lists are filled in lockstep by the
# directory walk, so they always have equal length.
df = pd.DataFrame({
    'file_name': file_names,
    # The folder list built by the walk is `original_folders`.
    'folder_name': original_folders,
    'preprocessed_text': preprocessed_documents,
})

# Save to CSV, omitting the DataFrame's integer index column
df.to_csv('preprocessed_texts.csv', index=False)