#### Import necessary libraries and data.

In [5]:
# Import necessary libraries
import os
import textract
import nltk
import pandas as pd
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/maryamzakiyya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maryamzakiyya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/maryamzakiyya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Define preprocessing function.

In [6]:
def preprocess_text(text): # tokenize, lowercase, lemmatize and remove stopwords.
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens]   
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

### Go through each .txt file in /textractOutputs and process it. Add file name, folder of origin, and the processed text itself into corresponding columns.

In [11]:
def process_text_files(directory):
    preprocessed_documents = []
    original_folders = []
    file_names = []

    # Traverse through the specified directory
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".txt"):
                filepath = os.path.join(subdir, file)
                folder_name = os.path.basename(subdir)
                
                # Read and preprocess the text file
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read()
                    preprocessed_text = preprocess_text(text)
                    
                    # Store data
                    preprocessed_documents.append(preprocessed_text)
                    original_folders.append(folder_name)
                    file_names.append(file)

    # Create DataFrame
    df = pd.DataFrame({
        'file_name': file_names,
        'folder_name': original_folders,
        'preprocessed_text': preprocessed_documents
    })
    
    return df


In [12]:
directory = 'textractOutputs/Comments Received After Field Review/Comments received after 7-31-20 posting_embeddedText'
df_1 = process_text_files(directory)
df_1

Unnamed: 0,file_name,folder_name,preprocessed_text
0,8-11-20 Kravitz and Hernandez_embedded.txt,Comments received after 7-31-20 posting_embedd...,august california department education n stree...
1,8-11-20 Misa_embedded.txt,Comments received after 7-31-20 posting_embedd...,misa fagasavali sent tuesday august ethnic stu...
2,8-11-20 Silverberg-Rajna_embedded.txt,Comments received after 7-31-20 posting_embedd...,shana sent monday august pm ethnic study cc em...
3,8-7-20 Schwartz_embedded.txt,Comments received after 7-31-20 posting_embedd...,ruth schwartz sent thursday august pm ethnic s...
4,8-11-20 Christie_embedded.txt,Comments received after 7-31-20 posting_embedd...,elizabeth christie sent tuesday august ethnic ...
...,...,...,...
633,8-10-20 Michael_embedded.txt,Comments received after 7-31-20 posting_embedd...,efraim michael sent monday august pm ethnic st...
634,8-11-20 Moreno_embedded.txt,Comments received after 7-31-20 posting_embedd...,melissa moreno sent tuesday august ethnic stud...
635,8-11-20 Fritsch_embedded.txt,Comments received after 7-31-20 posting_embedd...,sharon fritsch sent tuesday august pm ethnic s...
636,08-12-20 Amin_embedded.txt,Comments received after 7-31-20 posting_embedd...,jasmine amin sent wednesday august ethnic stud...


In [19]:
directory = 'textractOutputs/Comments Received After September 30, 2020/Comments received after 11-6-20 posting_embeddedText'
df_2 = process_text_files(directory)
df_2

Unnamed: 0,file_name,folder_name,preprocessed_text
0,11-13-20 Khan_embedded.txt,Comments received after 11-6-20 posting_embedd...,hareem khan sent thursday november pm ethnic s...
1,11-13-20 Solkovits_embedded.txt,Comments received after 11-6-20 posting_embedd...,g gsolk sent thursday november pm ethnic study...
2,11-13-20 Kaur and Singh_embedded.txt,Comments received after 11-6-20 posting_embedd...,november dear superintendent thurmond cde offi...
3,11-19-20 Lamont_embedded.txt,Comments received after 11-6-20 posting_embedd...,sami lamont sent wednesday november pm ethnic ...
4,11-18-20 Jensen_embedded.txt,Comments received after 11-6-20 posting_embedd...,sine hwang jensen sent tuesday november pm eth...
...,...,...,...
278,11-13-20 Cadji_embedded.txt,Comments received after 11-6-20 posting_embedd...,yahya josh cadji sent thursday november pm eth...
279,11-19-20 muslimsdocumenting_embedded.txt,Comments received after 11-6-20 posting_embedd...,iqc sent thursday november ethnic study ethnic...
280,11-12-20 Group Letter Support JIMENA_embedded.txt,Comments received after 11-6-20 posting_embedd...,california department education received submi...
281,11-12-20 Read_embedded.txt,Comments received after 11-6-20 posting_embedd...,priscilla read sent thursday november ethnic s...


In [20]:
directory = 'textractOutputs/Comments received after Third Field Review (after 1-21-21)_embeddedText'
df_3 = process_text_files(directory)
df_3

Unnamed: 0,file_name,folder_name,preprocessed_text
0,2-8-21 Lebsack_embedded.txt,Comments received after Third Field Review (af...,email redacted sent saturday february pm email...
1,2-16-21 Kuehnert_embedded.txt,Comments received after Third Field Review (af...,michael kuehnert sent tuesday february ethnic ...
2,1-25-21 Nicholls_embedded.txt,Comments received after Third Field Review (af...,tara nicholls email redacted sent monday janua...
3,2-19-21 Grigoryan_embedded.txt,Comments received after Third Field Review (af...,milena grigoryan sent friday february pm ethni...
4,1-22-21 Schreier-Fleming_embedded.txt,Comments received after Third Field Review (af...,maura sent friday january ethnic study ethnics...
...,...,...,...
185,1-24-21 Schenck_embedded.txt,Comments received after Third Field Review (af...,bram schenck sent saturday january pm ethnic s...
186,1-28-21 Melamed_embedded.txt,Comments received after Third Field Review (af...,shavit melamed sent wednesday january pm ethni...
187,1-27-21 Shore_embedded.txt,Comments received after Third Field Review (af...,noah sent tuesday january pm ethnic study ethn...
188,1-25-21 Torres_embedded.txt,Comments received after Third Field Review (af...,autumn torres sent sunday january pm ethnic st...


In [21]:
directory = 'textractOutputs/First Field Review (June - August 2019)/First Field Review (June - August 2019)_embeddedText'
df_4 = process_text_files(directory)
df_4

Unnamed: 0,file_name,folder_name,preprocessed_text
0,8-7-19 Peters_embedded.txt,First Field Review (June - August 2019)_embedd...,lorin peter sent tuesday august pm ethnic stud...
1,8-7-19 Pastcan_embedded.txt,First Field Review (June - August 2019)_embedd...,public input ethnic study model curriculum may...
2,8-13-19 Friedman_embedded.txt,First Field Review (June - August 2019)_embedd...,mona marley sent tuesday august pm ethnic stud...
3,8-9-19 Kronick_embedded.txt,First Field Review (June - August 2019)_embedd...,mel kronick sent friday august pm cfird subjec...
4,8-8-19 Vosicher_embedded.txt,First Field Review (June - August 2019)_embedd...,sally vosicher sent thursday august pm ethnic ...
...,...,...,...
4072,8-14-19 Collins_embedded.txt,First Field Review (June - August 2019)_embedd...,jerry collins sent wednesday august pm ethnic ...
4073,8-14-18 Dori_embedded.txt,First Field Review (June - August 2019)_embedd...,dori sent tuesday august pm ethnic study subje...
4074,8-13-19 Indiviglia_embedded.txt,First Field Review (June - August 2019)_embedd...,public input ethnic study model curriculum may...
4075,8-13-19 Arad_embedded.txt,First Field Review (June - August 2019)_embedd...,lilach arad sent tuesday august pm ethnic stud...


In [22]:
directory = 'textractOutputs/Second Field Review (Sept 2020)_embeddedText'
df_5 = process_text_files(directory)
df_5

Unnamed: 0,file_name,folder_name,preprocessed_text
0,9-7-20 Hofacre_embedded.txt,Second Field Review (Sept 2020)_embeddedText,message stephanie hofacre sent sunday septembe...
1,9-6-20 Hughes_embedded.txt,Second Field Review (Sept 2020)_embeddedText,sent sunday september pm cfird subject please ...
2,9-17-20 Kalfus_embedded.txt,Second Field Review (Sept 2020)_embeddedText,anat kalfus sent thursday september pm ethnic ...
3,9-24-20 Takapuka_embedded.txt,Second Field Review (Sept 2020)_embeddedText,sent wednesday september pm ethnic study ethni...
4,9-28-20 Quirk-Silva_embedded.txt,Second Field Review (Sept 2020)_embeddedText,state capitol po box sacramento ca filalifnrnh...
...,...,...,...
784,9-30-20 Lee and Fa 2 Attachment_embedded.txt,Second Field Review (Sept 2020)_embeddedText,asian american model minority myth theme histo...
785,9-16-20 Galper _embedded.txt,Second Field Review (Sept 2020)_embeddedText,giselle galper sent wednesday september ethnic...
786,9-30-20 gruen _embedded.txt,Second Field Review (Sept 2020)_embeddedText,ed gruen sent wednesday september pm ethnic st...
787,9-30-20 bayarea_embedded.txt,Second Field Review (Sept 2020)_embeddedText,bayarea chapter sent wednesday september pm et...


In [23]:
directory = 'textractOutputs/Third Field Review (Dec 2020 - Jan 2021)_embeddedText'
df_6 = process_text_files(directory)
df_6

Unnamed: 0,file_name,folder_name,preprocessed_text
0,1-20-21 Brott_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,june brott sent tuesday january pm ethnic stud...
1,1-20-21 Kuck_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,email redacted sent wednesday january ethnic s...
2,1-21-21 Klose_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,response ethnic study curriculum january unity...
3,1-21-21 Shoman_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,samia shoman sent thursday january ethnic stud...
4,1-21-21 Scheinman_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,phil scheinman sent thursday january pm ethnic...
...,...,...,...
799,12-29-20 Fine_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,celia fine sent monday december pm ethnic stud...
800,12-30-20 Gerber_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,mike gerber sent tuesday december pm joe nalve...
801,1-20-21 Goldstein_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,sondra goldstein sent tuesday january pm ethni...
802,1-19-21 Ayala_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,samantha ayala sent tuesday january pm ethnic ...


In [26]:
text_1 = df_1.copy()
text_2 = df_2.copy()
text_3 = df_3.copy()
text_4 = df_4.copy()
text_5 = df_5.copy()
text_6 = df_6.copy()

In [28]:
texts = pd.concat([text_1, text_2, text_3, text_4, text_5, text_6], axis=0, ignore_index=True)
texts

Unnamed: 0,file_name,folder_name,preprocessed_text
0,8-11-20 Kravitz and Hernandez_embedded.txt,Comments received after 7-31-20 posting_embedd...,august california department education n stree...
1,8-11-20 Misa_embedded.txt,Comments received after 7-31-20 posting_embedd...,misa fagasavali sent tuesday august ethnic stu...
2,8-11-20 Silverberg-Rajna_embedded.txt,Comments received after 7-31-20 posting_embedd...,shana sent monday august pm ethnic study cc em...
3,8-7-20 Schwartz_embedded.txt,Comments received after 7-31-20 posting_embedd...,ruth schwartz sent thursday august pm ethnic s...
4,8-11-20 Christie_embedded.txt,Comments received after 7-31-20 posting_embedd...,elizabeth christie sent tuesday august ethnic ...
...,...,...,...
6776,12-29-20 Fine_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,celia fine sent monday december pm ethnic stud...
6777,12-30-20 Gerber_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,mike gerber sent tuesday december pm joe nalve...
6778,1-20-21 Goldstein_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,sondra goldstein sent tuesday january pm ethni...
6779,1-19-21 Ayala_embedded.txt,Third Field Review (Dec 2020 - Jan 2021)_embed...,samantha ayala sent tuesday january pm ethnic ...


In [30]:
# Save to CSV
text_1.to_csv('text_1.csv', index=False)
text_2.to_csv('text_2.csv', index=False)
text_3.to_csv('text_3.csv', index=False)
text_4.to_csv('text_4.csv', index=False)
text_5.to_csv('text_5.csv', index=False)
text_6.to_csv('text_6.csv', index=False)
texts.to_csv('texts.csv', index=False)