# Chunk 6 - Without Staff (WOS) - Processing

## Brief Introduction

This section covers issues from the Center for Children's Books. The primary differentiator is that, although every book review in these issues is attributed to a member of the reviewing staff, none of the issues contains a list of the reviewing staff for that issue—thus, "without staff". Furthermore, in these issues, the coded symbol appears directly between the metadata text and the review text of each book review.

## Imports

In [5]:
import os
import shutil
import csv
import re
import pandas as pd
import openpyxl

## Preprocessing

1. The first preprocessing step involves identifying the last instance of the line "New Titles for Children and Young People" and removing everything up to and including it. We specifically look for the last instance because an issue can sometimes have multiple occurrences of this line in its introduction. Regardless of the number of instances, the introduction always ends with this line.

In [2]:
def remove_introduction(source_dir, target_dir, target_line):
    """
    Strip the introduction from every ``.txt`` file found in ``source_dir``.

    For each file, everything up to and including the final line that contains
    ``target_line`` is discarded and the remaining lines are written to a file
    of the same name in ``target_dir``. A file in which ``target_line`` never
    appears is not written at all; its name is listed at the end of the run.

    Args:
        source_dir (str): The directory containing the source text files.
        target_dir (str): The directory to save the processed text files
            (created if missing).
        target_line (str): The text marking the end of the introduction; a
            line qualifies when it contains this text anywhere.
    """
    os.makedirs(target_dir, exist_ok=True)

    total_hits = 0   # occurrences of the marker across all files
    unmarked = []    # names of files in which the marker never appears

    for filename in os.listdir(source_dir):
        if not filename.endswith('.txt'):
            continue

        with open(os.path.join(source_dir, filename), 'r', encoding='utf-8') as src:
            text_lines = src.readlines()

        # Positions of every line carrying the marker; only the last one matters
        hit_positions = [pos for pos, text in enumerate(text_lines) if target_line in text]
        total_hits += len(hit_positions)

        if hit_positions:
            remainder = text_lines[hit_positions[-1] + 1:]
            with open(os.path.join(target_dir, filename), 'w', encoding='utf-8') as dst:
                dst.writelines(remainder)
        else:
            unmarked.append(filename)

        print(f"Processed {filename}")

    print(f"Total number of '{target_line}' occurrences found: {total_hits}")
    if unmarked:
        print("Files without the target line:")
        for name in unmarked:
            print(name)

In [None]:
# Where the raw issues are read from and where the trimmed copies are written
source_directory = 'TXT'
target_directory = 'TXT_first_format'
# Text that marks the last line of every introduction
search_line = "New Titles for Children and Young People"

# Strip the introduction from every issue
remove_introduction(
    source_dir=source_directory,
    target_dir=target_directory,
    target_line=search_line,
)

2. The second preprocessing step involves manually removing the extraneous content that appears after the completion of book reviews in every issue.

3. The third preprocessing step is to remove any instances of lines that contain the page number. In almost every case, the page number is the only content present in the given line.

In [3]:
def remove_page_number_lines(source_dir, target_dir):
    """
    Copy every ``.txt`` file from ``source_dir`` to ``target_dir`` without its
    page-number lines.

    A line is dropped when, after stripping surrounding whitespace, it begins
    with ``[`` followed by one to three digits (a closing ``]`` is optional).
    All other lines are written through unchanged.

    Args:
        source_dir (str): The directory containing the source text files.
        target_dir (str): The directory to save the processed text files
            (created if missing).
    """
    os.makedirs(target_dir, exist_ok=True)

    # Matches a leading "[12]" or an unterminated "[12"
    page_number_regex = re.compile(r"^\s*(\[\s*\d{1,3}\s*\]|\[\s*\d{1,3})")

    txt_names = (name for name in os.listdir(source_dir) if name.endswith('.txt'))
    for filename in txt_names:
        with open(os.path.join(source_dir, filename), 'r', encoding='utf-8') as src:
            kept = [raw for raw in src if page_number_regex.match(raw.strip()) is None]

        with open(os.path.join(target_dir, filename), 'w', encoding='utf-8') as dst:
            dst.writelines(kept)

        print(f"Processed {filename}")

In [None]:
# Input: 'TXT_second_format' (presumably the result of the manual step 2 above);
# output: the same issues without page-number lines
input_folder = 'TXT_second_format'
output_folder = 'TXT_third_format'

# Strip the page-number lines from every issue
remove_page_number_lines(source_dir=input_folder, target_dir=output_folder)

4. The fourth preprocessing step is to remove any lines that begin with "C.U." or "D.V.".

In [62]:
def remove_cu_dv_lines(input_file_path, output_file_path):
    """
    Write a copy of a text file with its "C.U." and "D.V." lines removed.

    Lines are compared after stripping surrounding whitespace:

    * a line starting with "D.V." is dropped;
    * a line starting with "C.U." is dropped, and when the line two positions
      further on starts with "D.V.", the line in between and that "D.V." line
      are dropped with it.

    Every other line is copied unchanged.

    Args:
        input_file_path (str): The path to the input text file.
        output_file_path (str): The path to save the processed text file.

    Returns:
        None
    """
    with open(input_file_path, 'r', encoding='utf-8') as src:
        raw_lines = src.readlines()

    kept = []
    total = len(raw_lines)
    pos = 0

    while pos < total:
        text = raw_lines[pos].strip()

        if text.startswith("D.V."):
            pos += 1
            continue

        if not text.startswith("C.U."):
            kept.append(raw_lines[pos])
            pos += 1
            continue

        # "C.U." line: also swallow the next two lines when the second is "D.V."
        dv_follows = pos + 2 < total and raw_lines[pos + 2].strip().startswith("D.V.")
        pos += 3 if dv_follows else 1

    with open(output_file_path, 'w', encoding='utf-8') as dst:
        dst.writelines(kept)

In [None]:
# Step 4 reads the page-number-free issues and writes the C.U./D.V.-free copies
input_folder_path = 'TXT_third_format'
output_folder_path = 'TXT_fourth_format'

# Make sure the output folder is there
os.makedirs(output_folder_path, exist_ok=True)

# Run the clean-up on every .txt file, reporting (not raising) per-file failures
for filename in os.listdir(input_folder_path):
    if not filename.endswith('.txt'):
        continue

    input_file_path = os.path.join(input_folder_path, filename)
    output_file_path = os.path.join(output_folder_path, filename)
    print(f"Processing file: {input_file_path}")

    if not os.path.exists(input_file_path):
        print(f"Text file not found: {input_file_path}")
        continue

    try:
        remove_cu_dv_lines(input_file_path, output_file_path)
        print(f'Processed and saved: {output_file_path}')
    except Exception as e:
        print(f"An error occurred while processing {input_file_path}: {e}")

print("Processing complete.")

5. The fifth preprocessing step involves moving any line containing a coded symbol to the end of its respective metadata content. By "the end," we mean making it the very last line.

In [64]:
# Rating codes that can appear on a line of their own in a review
coded_symbols = ["R*", "R", "Ad", "M", "NR", "SpC", "SpR"]
# A line consisting of exactly one rating code (surrounding whitespace allowed)
coded_symbols_pattern = re.compile(
    r'^\s*(' + '|'.join(map(re.escape, coded_symbols)) + r')\s*$'
)
# A line that is a grade/age range such as "K-3" or "4-6", or that mentions "Gr." or "yrs."
grade_year_pattern = re.compile(
    r'^\s*(?:[Ka-zA-Z]-\d+|\d+-[a-zA-Z]*|\d+-\d*|\d+-|.*(?:Gr\.|yrs\.).*)\s*$'
)

In [65]:
def process_file(input_file_path, output_file_path):
    """
    Write a copy of a text file in which a coded-symbol line is swapped with
    the line that follows it whenever that following line looks like metadata.

    Adjacent lines are examined in pairs after stripping whitespace. When the
    first line matches the module-level ``coded_symbols_pattern`` and the
    second either matches ``grade_year_pattern`` or contains a ``$``, the two
    lines exchange places and scanning resumes after the pair. Otherwise
    scanning advances by one line.

    Args:
        input_file_path (str): The path to the input text file.
        output_file_path (str): The path to save the processed text file.
    """
    with open(input_file_path, 'r', encoding='utf-8') as src:
        rows = src.readlines()

    cursor = 0
    last_pair_start = len(rows) - 1

    while cursor < last_pair_start:
        current = rows[cursor].strip()
        follower = rows[cursor + 1].strip()

        symbol_before_metadata = coded_symbols_pattern.match(current) and (
            grade_year_pattern.match(follower) or '$' in follower
        )

        if symbol_before_metadata:
            # Push the symbol below the metadata line, then skip the swapped pair
            rows[cursor], rows[cursor + 1] = rows[cursor + 1], rows[cursor]
            cursor += 2
        else:
            cursor += 1

    with open(output_file_path, 'w', encoding='utf-8') as dst:
        dst.writelines(rows)

In [None]:
# Step 5 reads the C.U./D.V.-free issues and writes the copies with relocated coded symbols
input_folder_path = 'TXT_fourth_format'
output_folder_path = 'TXT_fifth_format'

# Make sure the output folder is there
os.makedirs(output_folder_path, exist_ok=True)

# Run the step on every .txt file, reporting (not raising) per-file failures
for filename in os.listdir(input_folder_path):
    if not filename.endswith('.txt'):
        continue

    input_file_path = os.path.join(input_folder_path, filename)
    output_file_path = os.path.join(output_folder_path, filename)
    print(f"Processing file: {input_file_path}")

    if not os.path.exists(input_file_path):
        print(f"Text file not found: {input_file_path}")
        continue

    try:
        process_file(input_file_path, output_file_path)
        print(f'Processed and saved: {output_file_path}')
    except Exception as e:
        print(f"An error occurred while processing {input_file_path}: {e}")

print("Processing complete.")

6. Our final preprocessing step involves renaming the files so that each name contains only volumeNumber_issueNumber_year.

In [67]:
def format_file_names(source_dir, target_dir):
    """
    Copy every ``.txt`` file from ``source_dir`` to ``target_dir`` under a
    shortened name.

    The original name (without its ``.txt`` extension) is split on
    underscores and the components at positions 7, 8 and 9 (0-based) are kept,
    joined by underscores, and given a single ``.txt`` extension. According to
    the surrounding notebook these components are the volume number, issue
    number and year.

    Files whose names have fewer than eight underscore-separated components
    have nothing to keep; they are skipped with a message rather than all
    being copied onto the same ``.txt`` target.

    Args:
        source_dir (str): The directory containing the source text files.
        target_dir (str): The directory to receive the renamed copies
            (created if missing).
    """
    os.makedirs(target_dir, exist_ok=True)

    for filename in os.listdir(source_dir):
        if not filename.endswith('.txt'):
            continue

        # Remove the extension before splitting. Otherwise, when a kept
        # component is also the last component of the name, it still carries
        # ".txt" and the result ends in ".txt.txt".
        stem = filename[:-len('.txt')]
        kept_parts = stem.split('_')[7:10]

        if not kept_parts:
            print(f"Skipped {filename}: expected at least 8 underscore-separated parts")
            continue

        new_filename = '_'.join(kept_parts) + '.txt'

        source_path = os.path.join(source_dir, filename)
        target_path = os.path.join(target_dir, new_filename)

        shutil.copy(source_path, target_path)
        print(f"Processed and renamed {filename} to {new_filename}")

In [None]:
# Step 6 reads the output of step 5 and writes the renamed, fully preprocessed issues
source_directory = 'TXT_fifth_format'
target_directory = 'TXT_preprocessed'

# Copy every issue under its shortened name
format_file_names(source_dir=source_directory, target_dir=target_directory)

## Calculating the actual number of reviews per file

In [69]:
def count_actual_number_of_reviews(source_dir, output_file, coded_symbols):
    """
    Count the coded-symbol lines in every ``.txt`` file of ``source_dir`` and
    save the per-file totals as a CSV.

    A line counts when, stripped of surrounding whitespace, it equals one of
    the entries in ``coded_symbols``. The CSV has the header ``id,count``; the
    id is the file name with ``.txt`` removed.

    Args:
        source_dir (str): The directory containing the preprocessed issues.
        output_file (str): The path of the CSV file to write.
        coded_symbols: Collection of symbol strings to look for.
    """
    tallies = []

    for filename in os.listdir(source_dir):
        if not filename.endswith('.txt'):
            continue

        with open(os.path.join(source_dir, filename), 'r', encoding='utf-8') as issue:
            hits = sum(1 for text in issue if text.strip() in coded_symbols)

        tallies.append([filename.replace('.txt', ''), hits])

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        out = csv.writer(csvfile)
        out.writerow(['id', 'count'])
        out.writerows(tallies)
    print("CSV file has been created with the count of coded symbols.")

In [None]:
# Count coded symbols in the preprocessed issues and store the totals in a CSV
source_directory = 'TXT_preprocessed'
output_csv_file = 'actual_number_of_reviews.csv'

# coded_symbols is the list defined alongside the step-5 patterns
count_actual_number_of_reviews(
    source_dir=source_directory,
    output_file=output_csv_file,
    coded_symbols=coded_symbols,
)

## Processing

We'll be performing 2 steps here: 
1. Divide each issue into its individual book reviews (an issue may yield anywhere from 0 to n reviews)
2. Subsequently, bifurcate each book review into:
   1. metadata.txt: containing the metadata of the review
   2. review.txt: containing the review portion

### 1. First, let's find the books in every issue.

In [87]:
# Define regex patterns
# author_regex: anchored at the start of a line (leading whitespace allowed); matches one
# or more tokens made of letters, commas and hyphens, separated by single whitespace
# characters, optionally followed by a parenthesised group, and then a period.
# identify_books() treats a matching line as the first line of a book review.
author_regex = re.compile(r"^\s*([A-Za-z,-]+(?:\s[A-Za-z,-]+)*)(?:\s*\([^)]*\))?\.")
# reviewer_regex: matches a line that ends in two or three capital letters (preceded by a
# word boundary), optionally followed by a period. identify_books() treats a matching
# line as the last line of a book review.
reviewer_regex = re.compile(r".*\b([A-Z]{2,3})\.?$")


In [89]:
def identify_books(input_file_path, output_folder_path):
    """
    Split one issue into individual book reviews.

    The file is scanned for a line matching the module-level ``author_regex``.
    From that line on, lines are collected (stripped) up to and including the
    first line matching the module-level ``reviewer_regex``, or up to the end
    of the file if no such line follows. Each collected review is written to
    ``<output_folder_path>/Book_<n>/book.txt`` with ``n`` counting from 1, and
    scanning resumes on the line after the review.

    Args:
        input_file_path (str): The path to the preprocessed issue.
        output_folder_path (str): The folder that receives the ``Book_<n>``
            sub-folders (created as needed).
    """
    with open(input_file_path, 'r', encoding='utf-8') as src:
        lines = src.readlines()

    total = len(lines)
    books_found = 0
    start = 0

    while start < total:
        if not author_regex.match(lines[start]):
            start += 1
            continue

        print(f"Book start found at line {start}: {lines[start]}")
        books_found += 1

        # Advance to the reviewer line (or to the end of the file)
        stop = start
        while stop < total and not reviewer_regex.match(lines[stop]):
            stop += 1
        collected = [raw.strip() for raw in lines[start:stop]]

        # The loop above ends either at a reviewer line or at end of file
        if stop < total:
            collected.append(lines[stop].strip())
            print(f"Reviewer found at line {stop}: {lines[stop]}")
            stop += 1
        start = stop

        # One folder per review, holding the raw review text
        book_folder = os.path.join(output_folder_path, f'Book_{books_found}')
        os.makedirs(book_folder, exist_ok=True)

        with open(os.path.join(book_folder, 'book.txt'), 'w', encoding='utf-8') as dst:
            dst.write('\n'.join(collected))
        print(f"Book {books_found} saved with {len(collected)} lines.")


In [None]:
# Preprocessed issues in, one folder of Book_<n> sub-folders per issue out
input_folder_path = 'TXT_preprocessed'
output_parent_folder = 'Books_identified'

# Make sure the output parent folder is there
os.makedirs(output_parent_folder, exist_ok=True)

# Split every issue into books, reporting (not raising) per-file failures
for filename in os.listdir(input_folder_path):
    if not filename.endswith('.txt'):
        continue

    input_file_path = os.path.join(input_folder_path, filename)
    # The issue's folder is named after the file, minus its extension
    output_folder_path = os.path.join(output_parent_folder, filename.replace('.txt', ''))
    print(f"Processing file: {input_file_path}")

    if not os.path.exists(input_file_path):
        print(f"Text file not found: {input_file_path}")
        continue

    try:
        identify_books(input_file_path, output_folder_path)
        print(f'Processed and saved: {output_folder_path}')
    except Exception as e:
        print(f"An error occurred while processing {input_file_path}: {e}")

print("Processing complete.")


### 2. Next, let's divide each book into metadata and review text files.

In [94]:
# Rating codes; the first line that consists solely of one of these ends the metadata
coded_symbols = ["R*", "R", "Ad", "M", "NR", "SpC", "SpR"]
# Define regex patterns
# A line consisting of exactly one rating code (surrounding whitespace allowed)
coded_symbols_pattern = re.compile(
    r'^\s*({})\s*$'.format('|'.join(re.escape(code) for code in coded_symbols))
)
# Two or three capital letters, optionally followed by a period, at the end of the text.
# NOTE: this rebinds reviewer_regex without the leading ".*" used for identify_books().
reviewer_regex = re.compile(r"\b([A-Z]{2,3})\.?$")


In [95]:
def divide_and_move_reviewer(book_folder):
    """
    Split ``book.txt`` in ``book_folder`` into ``metadata.txt`` and
    ``review.txt``, then delete ``book.txt``.

    Lines up to and including the first one that matches the module-level
    ``coded_symbols_pattern`` form the metadata; the remaining lines form the
    review. If no line matches, every line is metadata and the review is
    empty. When the last review line ends in reviewer initials (module-level
    ``reviewer_regex``), the initials are cut from that line and appended to
    the metadata as a line of their own. All lines are written stripped.

    Args:
        book_folder (str): Folder that contains ``book.txt`` and receives the
            two output files.
    """
    book_file_path = os.path.join(book_folder, 'book.txt')
    with open(book_file_path, 'r', encoding='utf-8') as src:
        lines = src.readlines()

    # Index just past the first coded-symbol line; len(lines) when there is none
    split_at = next(
        (pos + 1 for pos, raw in enumerate(lines) if coded_symbols_pattern.match(raw)),
        len(lines),
    )
    stripped = [raw.strip() for raw in lines]
    metadata = stripped[:split_at]
    review = stripped[split_at:]

    # Move trailing reviewer initials from the review into the metadata
    if review:
        initials_match = reviewer_regex.search(review[-1])
        if initials_match:
            review[-1] = reviewer_regex.sub("", review[-1]).strip()
            metadata.append(initials_match.group(1))

    with open(os.path.join(book_folder, 'metadata.txt'), 'w', encoding='utf-8') as dst:
        dst.write('\n'.join(metadata))

    with open(os.path.join(book_folder, 'review.txt'), 'w', encoding='utf-8') as dst:
        dst.write('\n'.join(review))

    # book.txt has been fully replaced by the two files above
    os.remove(book_file_path)


In [96]:
# Identified books in, bifurcated (metadata.txt + review.txt) books out
input_folder_path = 'Books_identified'
output_parent_folder = 'Books_bifurcated'

# Make sure the output parent folder is there
os.makedirs(output_parent_folder, exist_ok=True)

# Walk issue folders, then the book folders inside each issue
for folder_name in os.listdir(input_folder_path):
    book_folder_path = os.path.join(input_folder_path, folder_name)
    if not os.path.isdir(book_folder_path):
        continue

    output_folder_path = os.path.join(output_parent_folder, folder_name)
    os.makedirs(output_folder_path, exist_ok=True)

    for book_name in os.listdir(book_folder_path):
        book_path = os.path.join(book_folder_path, book_name)
        if not os.path.isdir(book_path):
            continue

        output_book_folder = os.path.join(output_folder_path, book_name)
        os.makedirs(output_book_folder, exist_ok=True)

        book_file_path = os.path.join(book_path, 'book.txt')
        if not os.path.exists(book_file_path):
            continue

        # Work on a copy so that the identified books stay untouched
        with open(book_file_path, 'r', encoding='utf-8') as file:
            book_content = file.readlines()

        output_book_file_path = os.path.join(output_book_folder, 'book.txt')
        with open(output_book_file_path, 'w', encoding='utf-8') as file:
            file.writelines(book_content)

        # Split the copy into metadata.txt and review.txt (this removes the copied book.txt)
        divide_and_move_reviewer(output_book_folder)

print("Processing complete.")

Processing complete.


### 3. Calculating the number of reviews that could not be captured

First, let's count the number of book reviews obtained per issue and save it to a CSV file.

In [97]:
def count_obtained_number_of_reviews(source_dir, output_file):
    """
    Count the book folders inside every issue folder of ``source_dir`` and
    save the per-issue totals as a CSV.

    Within an issue folder, an entry counts when it is a directory whose name
    contains ``Book_``. The CSV has the header ``id,count``; the id is the
    issue folder's name.

    Args:
        source_dir (str): Directory holding one sub-folder per issue.
        output_file (str): The path of the CSV file to write.
    """
    tallies = []

    for issue_folder in os.listdir(source_dir):
        issue_path = os.path.join(source_dir, issue_folder)
        if not os.path.isdir(issue_path):
            continue

        book_dirs = 0
        for entry in os.listdir(issue_path):
            if 'Book_' in entry and os.path.isdir(os.path.join(issue_path, entry)):
                book_dirs += 1
        tallies.append([issue_folder, book_dirs])

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        out = csv.writer(csvfile)
        out.writerow(['id', 'count'])
        out.writerows(tallies)
    print("CSV file has been created with the count of book folders.")

In [98]:
# Count the Book_<n> folders per issue and store the totals in a CSV
source_directory = 'Books_identified'
output_csv_file = 'obtained_number_of_reviews.csv'

count_obtained_number_of_reviews(
    source_dir=source_directory,
    output_file=output_csv_file,
)

CSV file has been created with the count of book folders.


Next, let's calculate the difference, per file, between the actual number of reviews and the obtained number of reviews.

In [99]:
def calculate_difference(actual_file, obtained_file, output_file):
    """
    Compare two ``id,count`` CSV files and save ``actual - obtained`` per id.

    Only ids present in ``actual_file`` are reported; an id missing from
    ``obtained_file`` is treated as having an obtained count of 0. The result
    is written to ``output_file`` with the header ``id,difference``.

    Args:
        actual_file (str): CSV with the actual number of reviews per id.
        obtained_file (str): CSV with the obtained number of reviews per id.
        output_file (str): The path of the CSV file to write.
    """
    def read_counts(path):
        # Map each id (first column) to its integer count (second column)
        counts = {}
        with open(path, newline='', encoding='utf-8') as handle:
            records = csv.reader(handle)
            next(records)  # header row
            for record in records:
                counts[record[0]] = int(record[1])
        return counts

    expected = read_counts(actual_file)
    found = read_counts(obtained_file)

    deltas = [
        [issue_id, expected_total - found.get(issue_id, 0)]
        for issue_id, expected_total in expected.items()
    ]

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        out = csv.writer(csvfile)
        out.writerow(['id', 'difference'])
        out.writerows(deltas)
    print("Comparison CSV file has been created.")

In [100]:
# The two per-issue count files produced above, and where to write the comparison
actual_csv_file = 'actual_number_of_reviews.csv'
obtained_csv_file = 'obtained_number_of_reviews.csv'
output_comparison_csv = 'review_count_comparison.csv'

calculate_difference(
    actual_file=actual_csv_file,
    obtained_file=obtained_csv_file,
    output_file=output_comparison_csv,
)

Comparison CSV file has been created.


### 4. Next, let's flatten the bifurcated books folder to aid in further processing

In [101]:
def flatten_folders(source_dir, target_dir):
    """
    Copy every ``<issue>/Book_<n>`` folder of ``source_dir`` to a single-level
    folder ``<issue>_<nnn>`` in ``target_dir``.

    ``<nnn>`` is the text between the first and second underscore of the book
    folder's name, left-padded with zeros to three characters. Every entry in
    the book folder is copied into the new folder under its own name. Entries
    that are not directories, or whose names do not start with ``Book_``, are
    ignored.

    Args:
        source_dir (str): Directory holding one sub-folder per issue.
        target_dir (str): Directory that receives the flattened folders
            (created if missing).
    """
    os.makedirs(target_dir, exist_ok=True)

    for issue_folder in os.listdir(source_dir):
        issue_path = os.path.join(source_dir, issue_folder)
        if not os.path.isdir(issue_path):
            continue

        for book_folder in os.listdir(issue_path):
            book_path = os.path.join(issue_path, book_folder)
            if not (os.path.isdir(book_path) and book_folder.startswith('Book_')):
                continue

            # Zero-pad the book number to three characters
            padded_number = book_folder.split('_')[1].zfill(3)
            flat_path = os.path.join(target_dir, f"{issue_folder}_{padded_number}")
            os.makedirs(flat_path, exist_ok=True)

            for entry in os.listdir(book_path):
                shutil.copy(os.path.join(book_path, entry), os.path.join(flat_path, entry))

    print("Folders have been flattened and files have been copied.")

In [102]:
# Nested issue/book folders in, one flat folder per review out
source_directory = 'Books_bifurcated'
target_directory = 'Books_bifurcated_flattened'

flatten_folders(source_dir=source_directory, target_dir=target_directory)

Folders have been flattened and files have been copied.


## Metadata Consolidation

In [103]:
# Define the directory containing the flattened per-review folders
parent_dir = "Books_bifurcated_flattened"

# Base Excel file path
base_excel_file_path = "consolidated_metadata.xlsx"

# Define regex patterns for extracting data.
# NOTE: author_regex and reviewer_regex rebind the names used earlier in the
# notebook for splitting issues into books; here they parse metadata.txt instead.
author_regex = re.compile(r"^(.*?), (.*?)(?=\.)")
book_name_regex = re.compile(r"\. (.*?)[.;]")
publisher_year_regex = re.compile(r"\.\s+([^.,]+), (\d{4})")
price_regex = re.compile(r"(\$\d+\.?\d*)")
isbn_regex = re.compile(r"(ISBN [\d-]+(?:,? ?[\d-]+)*).*?[.;]", re.DOTALL)
# "R\*" is listed explicitly: the pattern is anchored at both ends of the line,
# so without it a line reading "R*" matches none of the alternatives and the
# Code column is left empty for every starred review.
code_regex = re.compile(r"^(?:R\*|[*]|R|Ad|M|NR|SpC|SpR)$", re.MULTILINE)
reviewer_regex = re.compile(r"^([A-Z]{2,3})\.*$", re.MULTILINE)
grade_years_regex = re.compile(r"^(.*(?:Gr\.|yrs\.).*)$", re.MULTILINE)
illustrator_regex = re.compile(r"(illus\..*?)\.")
pages_regex = re.compile(r"\b(\d{1,3}p)\b")
square_bracket_regex = re.compile(
    r"\[\d{1,2}\]"
)  # Regex to find one- or two-digit numbers within square brackets

# Initialize lists for storing data rows and various conditions
rows = []
empty_book_name = []
empty_author_name = []
empty_publisher = []
empty_year = []
erroneous_price = []
multiple_initials = []

# Iterate over each folder in the parent directory. Sorting makes the row order
# reproducible; os.listdir() returns entries in arbitrary order.
for folder in sorted(os.listdir(parent_dir)):
    folder_path = os.path.join(parent_dir, folder)
    metadata_path = os.path.join(folder_path, "metadata.txt")

    if not os.path.exists(metadata_path):
        continue

    with open(metadata_path, "r", encoding="utf-8") as meta_file:
        metadata = meta_file.read()

    # Single-line version of the metadata, used for the publisher/year match,
    # which may span a line break in the original text
    temp_metadata = " ".join(metadata.splitlines())

    # Strip bracketed numbers such as "[12]" from both versions; the
    # surrounding text on the same line is kept
    metadata = square_bracket_regex.sub("", metadata)
    temp_metadata = square_bracket_regex.sub("", temp_metadata)

    # Extract information using regex
    author_match = author_regex.search(metadata)
    book_name_match = book_name_regex.search(metadata)
    publisher_year_match = publisher_year_regex.search(temp_metadata)
    price_matches = price_regex.findall(metadata)
    isbn_matches = isbn_regex.findall(metadata)
    code_match = code_regex.search(metadata)
    reviewer_match = reviewer_regex.search(metadata)
    grade_years_matches = grade_years_regex.findall(metadata)
    illustrator_match = illustrator_regex.search(metadata)
    pages_match = pages_regex.search(metadata)

    # Prepare data row; the column order must match the DataFrame built later:
    # ID, Year, Book Name, Author Name, Publisher, Price, ISBN, Code,
    # Reviewer, Grade Years, Illustrator, Pages
    row = [
        folder,
        publisher_year_match.group(2) if publisher_year_match else "",
        book_name_match.group(1) if book_name_match else "",
        (
            author_match.group(1) + ", " + author_match.group(2)
            if author_match
            else ""
        ),
        publisher_year_match.group(1) if publisher_year_match else "",
        ", ".join(price_matches),
        ", ".join(isbn_matches).replace("\n", " "),
        code_match.group(0) if code_match else "",
        reviewer_match.group(1) if reviewer_match else "",
        # De-duplicate while keeping first-seen order; set() would make the
        # order of the joined values vary from run to run
        ", ".join(dict.fromkeys(grade_years_matches)),
        illustrator_match.group(0) if illustrator_match else "",
        pages_match.group(0) if pages_match else "",
    ]

    # Add row to main list and check conditions for filtering.
    # Every check is independent, so a row may land in several lists.
    rows.append(row)
    if not row[2]:  # Empty Book Name
        empty_book_name.append(row)
    if not row[3]:  # Empty Author Name
        empty_author_name.append(row)
    if not row[4]:  # Empty Publisher
        empty_publisher.append(row)
    if not row[1]:  # Empty Year
        empty_year.append(row)
    if row[5].count("$") > 2:  # Erroneous Price (more than two prices found)
        erroneous_price.append(row)
    if len(row[2]) == 1:  # Book Names only 1 letter long
        multiple_initials.append(row)

In [104]:

# Convert lists to pandas DataFrames
df_all_rows = pd.DataFrame(
    rows,
    columns=[
        "ID",
        "Year",
        "Book Name",
        "Author Name",
        "Publisher",
        "Price",
        "ISBN",
        "Code",
        "Reviewer",
        "Grade Years",
        "Illustrator",
        "Pages",
    ],
)

df_empty_book_name = pd.DataFrame(empty_book_name, columns=df_all_rows.columns)
df_empty_author_name = pd.DataFrame(empty_author_name, columns=df_all_rows.columns)
df_empty_publisher = pd.DataFrame(empty_publisher, columns=df_all_rows.columns)
df_empty_year = pd.DataFrame(empty_year, columns=df_all_rows.columns)
df_erroneous_price = pd.DataFrame(erroneous_price, columns=df_all_rows.columns)
df_multiple_initials = pd.DataFrame(multiple_initials, columns=df_all_rows.columns)

# Filter the main dataframe to exclude rows present in ANY of the problem
# dataframes. Rows with an empty book name are part of the exclusion set; the
# list previously named df_erroneous_price twice and left them in "complete".
filtered_rows_df = df_all_rows[
    ~df_all_rows["ID"].isin(
        pd.concat(
            [
                df_empty_book_name["ID"],
                df_empty_author_name["ID"],
                df_empty_publisher["ID"],
                df_empty_year["ID"],
                df_erroneous_price["ID"],
                df_multiple_initials["ID"],
            ]
        )
    )
]

# Write to Excel file with multiple sheets using pandas:
# one sheet with everything, one with the clean rows, and one per problem category
with pd.ExcelWriter(base_excel_file_path, engine="openpyxl") as writer:
    df_all_rows.to_excel(writer, sheet_name="all_rows", index=False)
    filtered_rows_df.to_excel(writer, sheet_name="complete", index=False)
    df_empty_book_name.to_excel(writer, sheet_name="book_missing", index=False)
    df_empty_author_name.to_excel(writer, sheet_name="author_missing", index=False)
    df_empty_publisher.to_excel(writer, sheet_name="publisher_missing", index=False)
    df_empty_year.to_excel(writer, sheet_name="year_missing", index=False)
    df_erroneous_price.to_excel(writer, sheet_name="price_multiple", index=False)
    df_multiple_initials.to_excel(writer, sheet_name="initials_multiple", index=False)

# Output the counts
print(f"Total rows before filtering: {len(rows)}")
print(f"Rows with empty book name: {len(empty_book_name)}")
print(f"Rows with empty author name: {len(empty_author_name)}")
print(f"Rows with empty publisher: {len(empty_publisher)}")
print(f"Rows with empty year: {len(empty_year)}")
print(f"Rows with erroneous price: {len(erroneous_price)}")
print(f"Rows with book names only 1 letter long: {len(multiple_initials)}")


Total rows before filtering: 3784
Rows with empty book name: 20
Rows with empty author name: 85
Rows with empty publisher: 55
Rows with empty year: 55
Rows with erroneous price: 6
Rows with book names only 1 letter long: 58


## Review Text Consolidation

In [None]:
# Load the 'complete' sheet of the workbook written by the metadata-consolidation
# step. The path previously pointed at 'complete.xlsx', a file that no cell in
# this notebook creates.
excel_file_path = 'consolidated_metadata.xlsx'
df = pd.read_excel(excel_file_path, sheet_name='complete')

# Folder holding one sub-folder per review, as produced by flatten_folders().
# The path previously pointed at 'Bifurcated_flattened', a folder that no cell
# in this notebook creates.
bifurcated_folder_path = 'Books_bifurcated_flattened'

# Create a folder to store reviews if it doesn't exist
reviews_folder_path = 'Reviews'
os.makedirs(reviews_folder_path, exist_ok=True)

# For each ID listed in the sheet, copy <ID>/review.txt to Reviews/<ID>.txt
for raw_id in df['ID']:
    id_name = str(raw_id)

    # Check if the folder exists for the current ID
    folder_path = os.path.join(bifurcated_folder_path, id_name)
    if not os.path.exists(folder_path):
        print(f"Folder not found for ID {id_name}.")
        continue

    # Check if review.txt exists in the folder
    review_file_path = os.path.join(folder_path, 'review.txt')
    if not os.path.exists(review_file_path):
        print(f"Review file not found for ID {id_name}.")
        continue

    # Copy (the original stays in place) the review under a name derived from the ID
    new_file_name = id_name + '.txt'
    new_file_path = os.path.join(reviews_folder_path, new_file_name)
    shutil.copy(review_file_path, new_file_path)
    print(f"Review file for ID {id_name} copied and renamed.")

print("Process completed.")