# Import Libraries

In [None]:
from openpyxl import load_workbook, Workbook
import re
import os
import pandas as pd

# Prepare CSV

## Sort Chunk 6 by ID

In [72]:

# File names
# Workbook read by this cell.
input_file = "Chunk 6.xlsx"
# Workbook written by this cell, with rows ordered by the ID column.
output_file = "Chunk 6 Fully Sorted.xlsx"

# Custom sorting function for IDs
def custom_sort_key(id_str):
    """Build a sort key for an ID shaped like ``<prefix>_<digits><lowercase letters>``.

    Returns ``(prefix, number, suffix)`` with the digit run converted to ``int``,
    so that ``_9`` orders before ``_10`` and ``_9a`` directly after ``_9``.
    IDs that do not fit the pattern fall back to ``(id_str, 0, "")``.
    """
    parsed = re.match(r"(.+?)_(\d+)([a-z]*)$", id_str)
    if parsed is None:
        return (id_str, 0, "")
    head, digits, letters = parsed.groups()
    return (head, int(digits), letters)

try:
    # Open the source workbook; only its active sheet is used
    wb = load_workbook(input_file)
    ws = wb.active

    # Row 1 supplies the column names; every later row becomes a dict keyed by them
    header = [cell.value for cell in ws[1]]
    data = [
        dict(zip(header, values))
        for values in ws.iter_rows(min_row=2, values_only=True)
    ]

    # Order the rows by ID; the cell value is stringified before it is parsed
    sorted_data = sorted(data, key=lambda record: custom_sort_key(str(record['ID'])))

    # Build the output workbook: header first, then the rows in sorted order
    new_wb = Workbook()
    new_ws = new_wb.active
    new_ws.append(header)
    for record in sorted_data:
        new_ws.append([record[column] for column in header])

    # Persist and report
    new_wb.save(output_file)
    print(f"Rows sorted and saved as '{output_file}'.")
except Exception as e:
    # Any failure is reported as a message only; the cell itself does not raise
    print(f"Error processing file: {e}")

Rows sorted and saved as 'Chunk 6 Fully Sorted.xlsx'.


## Update Chunk 6 IDs (convert all lettered IDs to proper number-only IDs)

In [73]:
from openpyxl import load_workbook, Workbook
import re

# File names
# Reads the workbook written by the sorting cell.
input_file = "Chunk 6 Fully Sorted.xlsx"
# NOTE(review): no later cell in the visible notebook reads this output file; the
# next cell starts again from "Chunk 6 Fully Sorted.xlsx". Confirm this step is still needed.
output_file = "Chunk 6 IDs Updated.xlsx"

# Custom sorting function for IDs
def custom_sort_key(id_str):
    """
    Turn an ID of the form ``<prefix>_<digits><letters>`` into ``(prefix, int, letters)``.

    The numeric part is compared as an integer and the optional lowercase suffix
    breaks ties alphabetically. IDs that do not match yield ``(id_str, 0, "")``.
    """
    m = re.match(r"(.+?)_(\d+)([a-z]*)$", id_str)
    if not m:
        return (id_str, 0, "")  # Fallback key for IDs outside the pattern
    return (m.group(1), int(m.group(2)), m.group(3) or "")

# Function to process and update IDs
def process_ids(data):
    """Replace letter-suffixed IDs (e.g. ``..._001a``) with number-only IDs.

    Args:
        data: List of row dicts, each with an ``'ID'`` key, expected to be
            sorted by ID already. The dicts are modified in place.

    Returns:
        A new list holding the same row dicts in the same order.

    A suffixed ID ``<base>_<n><letters>`` becomes ``<base>_<n+1>`` (zero-padded to
    three digits), bumped further while that ID is already taken. Following rows
    with the same base whose number is not above the running counter are then
    renumbered as well.

    Rows whose ID is not a string (e.g. ``None`` for an empty cell, or a number)
    are passed through unchanged instead of raising ``TypeError`` in ``re.match``.

    NOTE(review): unsuffixed IDs are never compared against ``existing_ids``, so
    ``x_001, x_001a, x_002`` comes out as ``x_001, x_002, x_002`` (a duplicate).
    That behaviour is left as-is here.
    """
    id_pattern = r"(.+?)_(\d+)([a-z]*)$"
    updated_data = []
    existing_ids = set()  # Track IDs that have been processed

    for i, row in enumerate(data):
        current_id = row['ID']
        # Only strings can be parsed; anything else is treated as "no match"
        match = re.match(id_pattern, current_id) if isinstance(current_id, str) else None
        if match:
            base_id, number, suffix = match.groups()
            number = int(number)  # Convert numeric part to integer

            # If there's a suffix, adjust the ID and shift subsequent numbers
            if suffix:
                new_id = f"{base_id}_{number + 1:03}"
                while new_id in existing_ids:  # Ensure uniqueness locally
                    number += 1
                    new_id = f"{base_id}_{number + 1:03}"
                row['ID'] = new_id
                existing_ids.add(new_id)

                # Shift subsequent IDs if necessary
                for j in range(i + 1, len(data)):
                    next_row_id = data[j]['ID']
                    if not isinstance(next_row_id, str):
                        continue  # Same guard as above for later rows
                    next_match = re.match(id_pattern, next_row_id)
                    if next_match:
                        next_base_id, next_number, _ = next_match.groups()
                        if next_base_id == base_id and int(next_number) <= number:
                            number += 1
                            updated_id = f"{base_id}_{number + 1:03}"
                            data[j]['ID'] = updated_id
                            existing_ids.add(updated_id)
            else:
                existing_ids.add(current_id)

        updated_data.append(row)

    return updated_data

try:
    # Source workbook and its active sheet
    wb = load_workbook(input_file)
    ws = wb.active

    # The first row names the columns; the remaining rows are loaded as dicts
    header = [cell.value for cell in ws[1]]
    data = [dict(zip(header, cells)) for cells in ws.iter_rows(min_row=2, values_only=True)]

    # Sort by ID, then rewrite the IDs on the sorted rows
    sorted_data = sorted(data, key=lambda entry: custom_sort_key(str(entry['ID'])))
    updated_data = process_ids(sorted_data)

    # Emit header plus updated rows into a fresh workbook
    new_wb = Workbook()
    new_ws = new_wb.active
    new_ws.append(header)
    for entry in updated_data:
        new_ws.append([entry[name] for name in header])

    new_wb.save(output_file)
    print(f"Updated IDs saved as '{output_file}'.")
except Exception as e:
    # Failures are printed rather than raised
    print(f"Error processing file: {e}")

Updated IDs saved as 'Chunk 6 IDs Updated.xlsx'.


## Remove unnecessary "gaps" in IDs. 
For example, if a 51 is followed directly by a 53, the latter must be changed to 52.

In [74]:
from openpyxl import load_workbook, Workbook
import re

# File names
# NOTE(review): this is the sorted workbook, not "Chunk 6 IDs Updated.xlsx" written
# by the previous cell. Confirm that bypassing that output is intended.
input_file = "Chunk 6 Fully Sorted.xlsx"
# Read later by the review-extraction cell and the reviewer-abbreviation cell.
output_file = "Chunk 6 IDs Unique.xlsx"

# Custom sorting function for IDs
def custom_sort_key(id_str):
    """
    Split an ID into (prefix, numeric part as int, letter suffix) for sorting.

    IDs that do not look like ``<prefix>_<digits><letters>`` sort by their raw
    text with a numeric part of 0 and an empty suffix.
    """
    found = re.match(r"(.+?)_(\d+)([a-z]*)$", id_str)
    if found:
        stem, digits, tail = found.groups()
        key = (stem, int(digits), tail or "")
    else:
        key = (id_str, 0, "")
    return key

# Function to process and update IDs to ensure uniqueness
def process_ids(data):
    """Make every ID number-only and unique, renumbering upward on conflicts.

    Args:
        data: List of row dicts with an ``'ID'`` key, expected to be sorted by
            ID. The dicts are modified in place.

    Returns:
        The same list object that was passed in.

    An ID is rewritten when it carries a letter suffix or is already taken. The
    replacement starts at the ID's own number (zero-padded to three digits) and
    is incremented until it is free. Numbers are only ever incremented, so
    existing gaps in the numbering are left as they are.

    Rows whose ID is not a string (e.g. ``None`` for an empty cell, or a number)
    or does not match the ID pattern are left untouched; previously a non-string
    ID raised ``TypeError`` inside ``re.match``.
    """
    existing_ids = set()  # Track already-used IDs
    for row in data:
        current_id = row['ID']
        if not isinstance(current_id, str):
            continue  # Nothing to parse

        match = re.match(r"(.+?)_(\d+)([a-z]*)$", current_id)
        if not match:
            continue

        base_id, number, suffix = match.groups()
        number = int(number)  # Convert numeric part to integer

        # Generate a new ID if there is a suffix or conflict
        if suffix or current_id in existing_ids:
            new_id = f"{base_id}_{number:03}"
            while new_id in existing_ids:  # Ensure uniqueness
                number += 1
                new_id = f"{base_id}_{number:03}"
            row['ID'] = new_id
            existing_ids.add(new_id)
        else:
            existing_ids.add(current_id)  # Keep the original ID when it is free

    return data

try:
    # Open the input workbook and take its active sheet
    wb = load_workbook(input_file)
    ws = wb.active

    # Column names come from row 1; data rows start at row 2
    header = [cell.value for cell in ws[1]]
    data = []
    for values in ws.iter_rows(min_row=2, values_only=True):
        data.append(dict(zip(header, values)))

    # Sorted copy of the rows, ordered by the parsed ID
    sorted_data = list(data)
    sorted_data.sort(key=lambda item: custom_sort_key(str(item['ID'])))

    # Rewrite IDs so that each one is unique
    updated_data = process_ids(sorted_data)

    # Write header and rows to a new workbook
    new_wb = Workbook()
    new_ws = new_wb.active
    new_ws.append(header)
    for item in updated_data:
        new_ws.append([item[field] for field in header])

    new_wb.save(output_file)
    print(f"Updated IDs saved as '{output_file}'.")
except Exception as e:
    # Report the problem as text; nothing is re-raised
    print(f"Error processing file: {e}")

Updated IDs saved as 'Chunk 6 IDs Unique.xlsx'.


# Extracting Reviews

### Extract reviews using consecutive metadata rows as anchors

In [76]:
# Define paths
chunk_file = "Chunk 6 IDs Unique.xlsx"           # Excel file with metadata
text_files_folder = "Files"           # Input folder containing text files
output_folder = "Reviews"             # Output folder for extracted reviews

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Load the Excel file
# dtype=str asks pandas to read every column as text. The code below still guards
# fields with pd.notna(), so presumably empty cells load as NaN — verify.
df = pd.read_excel(chunk_file, dtype=str)

# Helper function to clean and normalize text
def clean_text(text):
    """Return ``text`` as a string with outer whitespace removed and inner runs collapsed to one space."""
    trimmed = str(text).strip()
    return re.sub(r'\s+', ' ', trimmed)

# Helper function to check for metadata match
def is_match(line, value):
    """Report whether ``value`` occurs in ``line``, ignoring case and whitespace differences.

    Both arguments are normalised with ``clean_text`` and lowercased before the
    substring test.

    Returns:
        ``True`` or ``False``. A missing, empty or whitespace-only ``value``
        never matches. Previously a whitespace-only value normalised to ``""``
        and matched every line, and falsy values were returned as-is instead of
        as ``False``.
    """
    if not value:
        return False
    needle = clean_text(value).lower()
    if not needle:
        # "" is a substring of every string, so it must not count as a hit
        return False
    return needle in clean_text(line).lower()

# Function to validate Price, Publisher, and Year in a block of lines
def validate_metadata(lines, start_idx, price_list, publisher, year):
    """Check whether price, publisher or year evidence appears near ``start_idx``.

    Args:
        lines: All lines of the text file.
        start_idx: Index of the candidate line; it and the next three lines are scanned.
        price_list: Price strings to look for.
        publisher: Publisher name, or "" when unknown.
        year: Publication year as text, or "" when unknown.

    Returns:
        ``True`` as soon as one scanned line contains any price, the
        ``"<publisher>, <year>"`` pair, or the year on its own; otherwise ``False``.

    Lines are lowercased before comparison, so the search terms are lowercased
    too (a price such as "Library ed. $5.99" could never match before). Empty
    price fragments — e.g. from a trailing comma — are dropped, because an empty
    string is contained in every line and made the check always succeed.
    """
    # Normalise the search terms once, outside the scan loop
    prices = [price.strip().lower() for price in price_list if price and price.strip()]
    year_key = year.lower() if year else ""
    combo = f"{publisher.lower()}, {year_key}" if publisher and year else ""

    for offset in range(4):  # Check current line + next 3 lines
        if start_idx + offset < len(lines):
            nearby_line = clean_text(lines[start_idx + offset]).lower()

            # Check for any price match
            if any(price in nearby_line for price in prices):
                return True

            # Check for Publisher-Year combo
            if combo and combo in nearby_line:
                return True

            # Check year alone
            if year_key and year_key in nearby_line:
                return True
    return False

# Function to find metadata position
def find_metadata(lines, row, start_idx):
    """Locate the line where the bibliographic entry for ``row`` starts.

    Args:
        lines: All lines of the volume's text file.
        row: Mapping with 'Title', 'Author', 'Price', 'Publisher' and 'Year'.
        start_idx: Index at which the scan begins.

    Returns:
        Index of the first line where the title or author occurs (on that line
        or the next) and ``validate_metadata`` confirms it, or ``-1`` if none.

    Missing fields are treated as empty instead of being stringified: a NaN
    author used to become the text "nan" and match lines such as "Nancy", and
    an empty title matched every line. Empty title/author never match now.
    """
    def _field(name):
        # Normalised text of a column, or "" when the cell is missing
        value = row[name]
        return clean_text(value) if pd.notna(value) else ""

    title = _field('Title').lower()
    author = _field('Author').lower()
    price_text = _field('Price')
    price_list = price_text.split(',') if price_text else []
    publisher = _field('Publisher')
    year = _field('Year')

    for idx in range(start_idx, len(lines)):
        line = clean_text(lines[idx]).lower()
        next_line = clean_text(lines[idx + 1]).lower() if idx + 1 < len(lines) else ""

        # The entry may wrap, so the following line is searched as well
        title_hit = bool(title) and (title in line or title in next_line)
        author_hit = bool(author) and (author in line or author in next_line)

        if title_hit or author_hit:
            # Require price, publisher or year evidence close by
            if validate_metadata(lines, idx, price_list, publisher, year):
                return idx  # Index where the metadata starts

    return -1

# Track the total number of "Review could not be extracted" cases
empty_review_count = 0

# Process each row
# For every metadata row: open the volume text file named by the ID, locate the
# row's entry, take the text up to the next row's entry, drop a few metadata
# lines, and write the result to "<output_folder>/<ID>.txt".
for i, row in df.iterrows():
    book_id = row['ID']
    # NOTE(review): `title` and `author` are assigned here but not read again in this loop.
    title = clean_text(row['Title'])
    author = clean_text(row['Author']) if pd.notna(row['Author']) else ""
    price = clean_text(row['Price']) if pd.notna(row['Price']) else ""
    pages = clean_text(row['Pages']) if pd.notna(row['Pages']) else ""
    grade_years = clean_text(str(row['Grade Years'])) if pd.notna(row['Grade Years']) else ""
    publisher = clean_text(row['Publisher']) if pd.notna(row['Publisher']) else ""
    year = clean_text(row['Year']) if pd.notna(row['Year']) else ""

    # Extract the filename from the ID
    # The first three underscore-separated parts name the file,
    # e.g. "41_(01)_1987_001" -> "41_(01)_1987.txt".
    volume_file = "_".join(book_id.split("_")[:3]) + ".txt"
    volume_file_path = os.path.join(text_files_folder, volume_file)

    if not os.path.exists(volume_file_path):
        print(f"Text file not found for ID {book_id}. Skipping...")
        continue

    # Read the text file
    with open(volume_file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Find the start index for current metadata
    # The search always starts at line 0 of the file, not after the previous entry.
    start_index = find_metadata(lines, row, 0)
    if start_index == -1:
        print(f"Start not found for ID {book_id}. Saving placeholder.")
        empty_review_count += 1  # Increment counter if no valid review content
        with open(os.path.join(output_folder, f"{book_id}.txt"), 'w', encoding='utf-8') as file:
            file.write("Review could not be extracted.")
        continue

    # Find the end index based on next row
    # Defaults to end of file when no next-row title/author is found.
    end_index = len(lines)
    # NOTE(review): `i` is the index label from iterrows() but is used as a position
    # with iloc — assumes the default 0..n-1 index; TODO confirm.
    if i + 1 < len(df):
        next_row = df.iloc[i + 1]
        next_title, next_author = map(
            lambda x: clean_text(x) if pd.notna(x) else "", 
            [next_row['Title'], next_row['Author']]
        )
        # The scan begins two lines after the start line
        for idx in range(start_index + 2, len(lines)):
            line = lines[idx]
            # Check if the next metadata (title or author) is found
            if is_match(line, next_title) or is_match(line, next_author):
                end_index = idx  # Stop at the start of the next metadata line
                break

    # Extract review text
    # The start line itself is excluded.
    review_lines = lines[start_index + 1:end_index]

    # Remove lines containing specific metadata (only once per category)
    cleaned_lines = []
    removed_price = removed_pages = removed_grade = removed_publisher = False

    for line in review_lines:
        line_str = clean_text(str(line))  # Ensure all lines are strings
        if not removed_price and is_match(line_str, price):
            removed_price = True
            continue
        if not removed_pages and is_match(line_str, pages):
            removed_pages = True
            continue
        if not removed_grade and is_match(line_str, grade_years):
            removed_grade = True
            continue
        if not removed_publisher and is_match(line_str, publisher):
            removed_publisher = True
            continue
        # The original (un-normalised) line is kept, including its newline
        cleaned_lines.append(line)

    # Combine cleaned lines
    review_text = "".join(cleaned_lines).strip()
    if not review_text:
        empty_review_count += 1  # Increment counter if no valid review content
        review_text = "Review could not be extracted."
        # NOTE(review): this reuses the "Start not found" message although the start
        # was found and the extracted text is empty.
        print(f"Start not found for ID {book_id}. Saving placeholder.")

    # Save the review text
    review_file_path = os.path.join(output_folder, f"{book_id}.txt")
    with open(review_file_path, 'w', encoding='utf-8') as file:
        file.write(review_text)

    # Printed for placeholder files written by the branch above as well
    print(f"Processed review for ID {book_id}")

# Print summary of empty reviews
print(f"\nTotal 'Review could not be extracted' files: {empty_review_count}")

# Save updated DataFrame
# NOTE(review): df is not modified anywhere in this cell, so this writes the
# metadata as it was loaded.
df.to_excel("Chunk 6 parsed.xlsx", index=False)

Processed review for ID 41_(01)_1987_001
Processed review for ID 41_(01)_1987_002
Processed review for ID 41_(01)_1987_003
Processed review for ID 41_(01)_1987_004
Processed review for ID 41_(01)_1987_005
Processed review for ID 41_(01)_1987_006
Processed review for ID 41_(01)_1987_007
Processed review for ID 41_(01)_1987_008
Processed review for ID 41_(01)_1987_009
Processed review for ID 41_(01)_1987_010
Processed review for ID 41_(01)_1987_011
Processed review for ID 41_(01)_1987_012
Processed review for ID 41_(01)_1987_013
Processed review for ID 41_(01)_1987_014
Processed review for ID 41_(01)_1987_015
Processed review for ID 41_(01)_1987_016
Processed review for ID 41_(01)_1987_017
Processed review for ID 41_(01)_1987_018
Processed review for ID 41_(01)_1987_019
Processed review for ID 41_(01)_1987_020
Processed review for ID 41_(01)_1987_021
Processed review for ID 41_(01)_1987_022
Processed review for ID 41_(01)_1987_023
Processed review for ID 41_(01)_1987_024
Processed review

# Processing reviews

## Identify presence of extraneous books and remove them from each review file

#### First pass with Author + Coded Symbol + "illus. by"

In [80]:
# Define input and output folders
# input_folder / output_folder are module-level names; each later processing cell
# rebinds them to its own pair of folders.
input_folder = "Reviews"  # Folder containing original review text files
output_folder = "Reviews Processed"  # Folder to save processed reviews

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Regex patterns
# author_regex: a line starting with "LastName, FirstName" (multi-word surnames,
# middle initials and one extra given name allowed).
author_regex = re.compile(
    r"^\s*([A-Z][A-Za-z'-]+(?:\s[A-Za-z'-]+)*,)"  # LastName or multi-word LastName (e.g., "De Leeuw,")
    r"\s([A-Z][A-Za-z'-]*"                       # FirstName (e.g., Adele)
    r"(?:\s[A-Z]\.)*"                            # Optional middle initials (e.g., "A.")
    r"(?:\s[A-Za-z'-]+)?)"                       # Optional middle/compound name (e.g., Adele Louise)
)
price_regex = re.compile(r"(\$\d+\.?\d*)")  # "$11.95", "$10"
pages_regex = re.compile(r"\b(\d{1,3}p)\b")  # "32p", "143p"
publisher_year_regex = re.compile(r"\.\s+([^.,]+),\s(\d{4})")  # Matches: Publisher, 1958
illus_by_regex = re.compile(r"illus\. by", re.IGNORECASE)  # Match 'illus. by' case-insensitively

# Coded symbols to match (case insensitive, whole line)
coded_symbols = ["R*", "R", "Ad", "M", "NR", "SpC", "SpR"]
# Each symbol is escaped before joining: unescaped, "R*" is read as "zero or more
# R", which matched blank lines and "RRR" but never the literal text "R*".
coded_symbol_regex = re.compile(
    r"^\s*(" + "|".join(re.escape(symbol) for symbol in coded_symbols) + r")\s*$",
    re.IGNORECASE,
)

# Function to process a single file
def process_file(input_path, output_path, filename):
    """Copy a review file, cutting it off where another book's entry begins.

    The first four lines are always kept. From the fifth line on, the file is
    truncated at the first line that either

    * contains "illus. by" and has a price, page count or "Publisher, year"
      on that line or one of the next two, or
    * starts like an author name and has such metadata on that line or the next.

    The coded-symbol check is not applied in this pass.

    Args:
        input_path: File to read.
        output_path: File to write the kept lines to.
        filename: Not used by this function.

    Returns:
        A list holding the stripped cut-off line, or an empty list if the file
        was copied unchanged.
    """
    def mentions_book_metadata(candidates):
        # True if any candidate line carries a price, page count or publisher/year
        for text in candidates:
            if (price_regex.search(text) or pages_regex.search(text)
                    or publisher_year_regex.search(text)):
                return True
        return False

    with open(input_path, 'r', encoding='utf-8') as source:
        lines = source.readlines()

    cut_at = None
    for position in range(4, len(lines)):
        current = lines[position]

        # "illus. by" confirmed by metadata within this line and the next two
        if illus_by_regex.search(current) and mentions_book_metadata(lines[position:position + 3]):
            cut_at = position
            break

        # Author-style opening confirmed by metadata within this line and the next
        if author_regex.match(current) and mentions_book_metadata(lines[position:position + 2]):
            cut_at = position
            break

    if cut_at is None:
        kept_lines, identified_lines = lines, []
    else:
        kept_lines, identified_lines = lines[:cut_at], [lines[cut_at].strip()]

    with open(output_path, 'w', encoding='utf-8') as target:
        target.writelines(kept_lines)

    return identified_lines

# Cut-off line(s) reported per file, keyed by filename
all_removed_lines = {}

# Run the first pass over every .txt file in the input folder
for filename in os.listdir(input_folder):
    if not filename.endswith(".txt"):
        continue

    input_file_path = os.path.join(input_folder, filename)
    output_file_path = os.path.join(output_folder, filename)

    removed_lines = process_file(input_file_path, output_file_path, filename)
    if removed_lines:
        all_removed_lines[filename] = removed_lines

# One dictionary entry exists per file that was cut
files_with_removals = len(all_removed_lines)

# Final summary
print(f"Processing complete. Cleaned files are saved in '{output_folder}'.")
print(f"Number of files where lines were identified and removed: {files_with_removals}\n")

# List the identified line(s) for each affected file
print("Identified lines removed from each file:")
for file, lines in all_removed_lines.items():
    print(f"\nFile: {file}")
    for line in lines:
        print(f" - {line}")


Processing complete. Cleaned files are saved in 'Reviews Processed'.
Number of files where lines were identified and removed: 46

Identified lines removed from each file:

File: 41_(02)_1987_004.txt
 - Bible, adaptations of. The Book ofAdam to Moses; ad by Lore Segal; illus. by Leonard

File: 41_(04)_1987_033.txt
 - John Langstaff and John Andrew Ross; illus. by Ashley Bryan. Margaret K.

File: 41_(04)_1987_043.txt
 - Mahy, Margaret. 17 Kings and 42 Elephants; illus. by Patricia MacCarthy. Dial, 1987.

File: 41_(05)_1988_063.txt
 - Sheridan, Sybil. Stories from the Jewish World; illus by Robert Geary. Silver Burdett,

File: 41_(06)_1988_020.txt
 - Fleischman, Paul. Joyful Noise: Poems for Two Voices; illus. by Eric Beddows. Harper,

File: 41_(09)_1988_051.txt
 - Shusterman, Neal. The Shadow Club. Little, 1988. ISBN 0-316-77540-1.

File: 42_(02)_1988_082.txt
 - Lowry, Lois. All About Sam; illus. by Diane deGroat. Houghton, 1988.

File: 42_(03)_1988_057.txt
 - Burton, Marilee Robin. Tail

#### Second pass with publisher/price/pages

In [81]:
# Define input and output folders
input_folder = "Reviews Processed"  # Input folder from first pass
output_folder = "Reviews Processed1"  # Output folder for second pass

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Regex patterns for price, pages, and publisher
# These three repeat the pattern strings compiled in the first-pass cell.
price_regex = re.compile(r"(\$\d+\.?\d*)")  # "$11.95", "$10"
pages_regex = re.compile(r"\b(\d{1,3}p)\b")  # "32p", "143p"
publisher_year_regex = re.compile(r"\.\s+([^.,]+),\s(\d{4})")  # ". Publisher, 1988"

# Function to check for valid combos
def has_valid_combo(lines, start_idx):
    """Report whether at least two kinds of book metadata occur near ``start_idx``.

    The line at ``start_idx`` and the two after it are scanned for a price, a
    page count and a "Publisher, year" pair. Hits accumulate across the scanned
    lines; the function returns ``True`` once two different kinds have been
    seen and ``False`` otherwise.
    """
    patterns = (price_regex, pages_regex, publisher_year_regex)
    found = [False, False, False]  # price, pages, publisher

    for position in range(start_idx, start_idx + 3):
        if position < len(lines):
            text = lines[position]
            for slot, pattern in enumerate(patterns):
                if pattern.search(text):
                    found[slot] = True
        if found.count(True) >= 2:
            return True
    return False

# Function to process files
def process_file(input_path, output_path, filename):
    """Copy a review file, truncating it at the first line that looks like book metadata.

    The first four lines are always kept. After that, the file is cut at the
    first line that contains a price, page count or "Publisher, year" pair and
    for which ``has_valid_combo`` confirms a second kind of metadata nearby.

    Args:
        input_path: File to read.
        output_path: File to write the kept lines to.
        filename: Not used by this function.

    Returns:
        A list holding the stripped cut-off line, or an empty list if nothing was cut.
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        lines = source.readlines()

    kept_lines = list(lines[:4])  # The opening lines are never examined
    identified_lines = []

    for position in range(4, len(lines)):
        text = lines[position]
        looks_like_metadata = (
            price_regex.search(text)
            or pages_regex.search(text)
            or publisher_year_regex.search(text)
        )
        if looks_like_metadata and has_valid_combo(lines, position):
            # Everything from this line onward is dropped
            identified_lines.append(text.strip())
            break
        kept_lines.append(text)

    with open(output_path, 'w', encoding='utf-8') as target:
        target.writelines(kept_lines)

    return identified_lines

# Second Pass Execution
# Cut-off line(s) reported per file, keyed by filename
all_removed_lines = {}

for filename in os.listdir(input_folder):
    if not filename.endswith(".txt"):
        continue

    input_file_path = os.path.join(input_folder, filename)
    output_file_path = os.path.join(output_folder, filename)

    removed_lines = process_file(input_file_path, output_file_path, filename)
    if removed_lines:
        all_removed_lines[filename] = removed_lines

# One dictionary entry exists per file that was cut
files_with_removals = len(all_removed_lines)

print("Second Pass Complete. Files saved to 'Reviews Processed1'.")
print(f"Files cleaned in Second Pass: {files_with_removals}\n")

# List the identified line(s) for each affected file
print("Identified lines removed from each file:")
for file, lines in all_removed_lines.items():
    print(f"\nFile: {file}")
    for line in lines:
        print(f" - {line}")

Second Pass Complete. Files saved to 'Reviews Processed1'.
Files cleaned in Second Pass: 13

Identified lines removed from each file:

File: 42_(01)_1988_050.txt
 - 30p. (First Experiences).

File: 42_(02)_1988_082.txt
 - 143p.

File: 43_(03)_1989_022.txt
 - ISBN 0-8172-2916-7 32p. illus. (Raintree Hispanic Stories). Library ed.

File: 43_(04)_1989_011.txt
 - 32p. illus. and with photographs. $11.90.

File: 43_(10)_1990_039.txt
 - 0-531-10896-1. 64p. illus. with photographs. (First Books). $10.90.

File: 44_(03)_1990_068.txt
 - Avi. The True Confessions of Charlotte Doyle. Jackson/Orchard, 1990.

File: 44_(06)_1991_038.txt
 - 4. [40p.] illus. with photographs. $14.95. Reviewed from galleys.

File: 44_(08)_1991_036.txt
 - Lawrence Migdale. Holiday House, 1991. ISBN 0-8234-0864-7. 32p. $14.95.

File: 45_(02)_1991_053.txt
 - World. Bradbury, 1991. ISBN 0-02-778072-4. [64p.] illus. with photographs.

File: 45_(03)_1991_010.txt
 - 30p.

File: 45_(04)_1991_033.txt
 - 161p. $14.95.

File: 45_

### Remove lines with $ symbol, specifically those containing "Library" or "Trade" or "Paperback" or "Hardbound"

In [83]:
# Define input and output folders
input_folder = "Reviews Processed1"  # Folder containing processed review files
output_folder = "Reviews Processed2"  # Folder to save further processed reviews

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Patterns
# Keywords are matched as plain substrings of the lowercased line.
keywords = ["library", "trade", "paperback", "hardbound"]
# The pattern contains no letters, so re.IGNORECASE has no effect on it.
price_pattern = re.compile(r"\$\d+\.?\d*", re.IGNORECASE)  # Match any $ value like $12.15
ed_price_pattern = re.compile(r"\bed\.,?\s*\$\d+\.?\d*", re.IGNORECASE)  # Match "ed., $2.50"
edition_price_pattern = re.compile(r"\bedition,?\s*\$\d+\.?\d*", re.IGNORECASE)  # Match "edition, $2.50"
net_price_pattern = re.compile(r"\$\d+\.?\d*\s*net\.?", re.IGNORECASE)  # Match "$2.63 net."
# Alternatives: "4-7", "K-3", or any line containing "Gr." or "yrs.".
# NOTE(review): the third alternative accepts a whole line of arbitrary text as long
# as it contains "Gr." or "yrs.", so it is not limited to grade-only lines — confirm
# that this is intended.
grade_years_regex = re.compile(r"^(\d{1,2}-\d{1,2}|[A-Z]-\d{1,2}|.*(?:Gr\.|yrs\.).*)$")  # Match grade years

# Function to check for keyword and price combination
def contains_keyword_and_price(line):
    """Return True when the line has a dollar amount and one of the edition keywords.

    Keywords are looked up case-insensitively as substrings of the line.
    """
    if price_pattern.search(line) is None:
        return False
    lowered = line.lower()
    for keyword in keywords:
        if keyword in lowered:
            return True
    return False

# Function to check for "ed., $<amount>" pattern
def contains_ed_price(line):
    """Return True when the line contains "ed." (optionally "ed.,") followed by a dollar amount."""
    found = ed_price_pattern.search(line)
    return found is not None

# Function to check for "edition, $<amount>" pattern
def contains_edition_price(line):
    """Return True when the line contains "edition" (optionally with a comma) followed by a dollar amount."""
    found = edition_price_pattern.search(line)
    return found is not None

# Function to check for "$<amount> net" pattern
def contains_net_price(line):
    """Return True when the line contains a dollar amount followed by "net" (e.g. "$2.63 net.")."""
    found = net_price_pattern.search(line)
    return found is not None

# Function to check if a line ONLY contains a number or price
def contains_only_number_or_price(line):
    """Return True when the stripped line is nothing but digits or a single dollar amount."""
    candidate = line.strip()
    return re.fullmatch(r"\d+|\$\d+\.?\d*", candidate) is not None

# Function to check if a line ONLY contains grade years
def contains_only_grade_years(line):
    """Return True when the stripped line fully matches ``grade_years_regex``.

    What counts as a grade/age line is decided entirely by that module-level pattern.
    """
    candidate = line.strip()
    return grade_years_regex.fullmatch(candidate) is not None

# Function to process a single file
def process_file(input_path, output_path, filename):
    """Copy a review file without its price/edition/grade lines and without blank lines.

    A line is dropped when any of the ``contains_*`` checks accepts it. Dropped
    lines other than the file's first line are printed as
    ``File: <filename> | Line <n>: <text>`` with 1-based line numbers.

    Args:
        input_path: File to read.
        output_path: File to write the kept, non-blank lines to.
        filename: Name shown in the printed report.
    """
    # Evaluated in this order; the first check that accepts a line decides
    removal_checks = (
        contains_keyword_and_price,
        contains_ed_price,
        contains_edition_price,
        contains_net_price,
        contains_only_number_or_price,
        contains_only_grade_years,
    )

    with open(input_path, 'r', encoding='utf-8') as source:
        lines = source.readlines()

    kept_lines = []
    flagged_cases = []  # (filename, 1-based line number, stripped text)

    for line_number, text in enumerate(lines, start=1):
        if any(check(text) for check in removal_checks):
            # The first line is removed silently; later ones are reported
            if line_number > 1:
                flagged_cases.append((filename, line_number, text.strip()))
        else:
            kept_lines.append(text)

    # Blank lines are filtered out when writing
    with open(output_path, 'w', encoding='utf-8') as target:
        target.writelines(text for text in kept_lines if text.strip())

    for name, line_number, text in flagged_cases:
        print(f"File: {name} | Line {line_number}: {text}")

# Run the third pass over every .txt file in the input folder
for filename in os.listdir(input_folder):
    if not filename.endswith(".txt"):
        continue

    input_file_path = os.path.join(input_folder, filename)
    output_file_path = os.path.join(output_folder, filename)
    process_file(input_file_path, output_file_path, filename)

print("Processing complete. Cleaned files are saved in 'Reviews Processed2'.")

File: 41_(01)_1987_013.txt | Line 3: (Stepping Stone). Library ed. $5.99; Paper ed. $1.95. Reviewed from galleys.
File: 41_(01)_1987_033.txt | Line 2: Trade ed. $11.95. Reviewed from galleys.
File: 41_(01)_1987_048.txt | Line 2: 4-7 yrs.
File: 41_(01)_1987_050.txt | Line 2: Trade ed. ISBN 0-531-05709-7. Library ed. $11.99; Trade ed. $11.95. Reviewed
File: 41_(01)_1987_054.txt | Line 2: $11.85; Trade ed. $10.95.
File: 41_(01)_1987_055.txt | Line 2: Library ed. $11.85; Trade ed. $10.95.
File: 41_(02)_1987_003.txt | Line 2: ed. $11.95. Reviewed from galleys.
File: 41_(02)_1987_010.txt | Line 2: Library ed. $12.89; Trade ed. $12.95.
File: 41_(02)_1987_012.txt | Line 2: Library ed. $12.89; Trade ed. $12.95. Reviewed from galleys.
File: 41_(02)_1987_013.txt | Line 2: ed. $10.95. Reviewed from galleys.
File: 41_(02)_1987_017.txt | Line 2: Library ed. $15.85; Trade ed.
File: 41_(02)_1987_019.txt | Line 2: Library ed. $11.89; Trade ed. $11.95. Reviewed from galleys.
File: 41_(02)_1987_028.txt |

### Remove lines containing only review codes or price

coded_symbols = ["R*", "R", "Ad", "M", "NR", "SpC", "SpR"]

In [84]:
# Folders for this step
input_folder = "Reviews Processed2"  # Input folder
output_folder = "Reviews Processed3"  # Output folder for cleaned files

# Make sure the destination exists
os.makedirs(output_folder, exist_ok=True)

# Review codes; a line is dropped when its stripped text equals one of these exactly
coded_symbols = ["R*", "R", "Ad", "M", "NR", "SpC", "SpR"]

# A line holding nothing but a price, optionally followed by spaces, "." or "!"
# (e.g. "$3.25.", "$10").
# NOTE(review): this rebinds the module-level name `price_regex` that the
# process_file functions of the first and second pass read when they are called.
price_regex = re.compile(r"^\s*\$\d+(\.\d{2})?[\s\.\!]*$")

# Bookkeeping for the summary
updated_files = []
unchanged_files = []

for review_file in os.listdir(input_folder):
    input_file_path = os.path.join(input_folder, review_file)
    output_file_path = os.path.join(output_folder, review_file)

    # Only regular .txt files are handled
    if not (os.path.isfile(input_file_path) and review_file.endswith(".txt")):
        continue

    with open(input_file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Keep everything except code-only and price-only lines
    cleaned_lines = [
        line for line in lines
        if not (line.strip() in coded_symbols or price_regex.match(line))
    ]

    # When nothing was dropped, cleaned_lines has the same content as lines,
    # so a single write covers both the changed and the unchanged case
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.writelines(cleaned_lines)

    if len(cleaned_lines) != len(lines):
        updated_files.append(review_file)
    else:
        unchanged_files.append(review_file)

# Print final summary
print("\n--- Processing Summary ---")
print(f"Total files updated: {len(updated_files)}")
print(f"Total files unchanged: {len(unchanged_files)}")
if updated_files:
    print("\nFiles where deletions were made:")
    for file in updated_files:
        print(f" - {file}")
if unchanged_files:
    print("\nFiles saved without changes:")
    for file in unchanged_files:
        print(f" - {file}")


--- Processing Summary ---
Total files updated: 3709
Total files unchanged: 79

Files where deletions were made:
 - 41_(01)_1987_001.txt
 - 41_(01)_1987_002.txt
 - 41_(01)_1987_003.txt
 - 41_(01)_1987_004.txt
 - 41_(01)_1987_005.txt
 - 41_(01)_1987_006.txt
 - 41_(01)_1987_007.txt
 - 41_(01)_1987_008.txt
 - 41_(01)_1987_009.txt
 - 41_(01)_1987_010.txt
 - 41_(01)_1987_011.txt
 - 41_(01)_1987_012.txt
 - 41_(01)_1987_013.txt
 - 41_(01)_1987_014.txt
 - 41_(01)_1987_015.txt
 - 41_(01)_1987_016.txt
 - 41_(01)_1987_017.txt
 - 41_(01)_1987_018.txt
 - 41_(01)_1987_019.txt
 - 41_(01)_1987_020.txt
 - 41_(01)_1987_021.txt
 - 41_(01)_1987_022.txt
 - 41_(01)_1987_023.txt
 - 41_(01)_1987_024.txt
 - 41_(01)_1987_025.txt
 - 41_(01)_1987_026.txt
 - 41_(01)_1987_027.txt
 - 41_(01)_1987_028.txt
 - 41_(01)_1987_029.txt
 - 41_(01)_1987_030.txt
 - 41_(01)_1987_031.txt
 - 41_(01)_1987_032.txt
 - 41_(01)_1987_033.txt
 - 41_(01)_1987_034.txt
 - 41_(01)_1987_035.txt
 - 41_(01)_1987_036.txt
 - 41_(01)_1987_037.tx

### Remove reviewer abbreviations

In [85]:
# Define input and output folders
input_folder = "Reviews Processed3"  # Input folder containing review files
output_folder = "Reviews Processed4"  # Output folder for cleaned files
metadata_file = "Chunk 6 IDs Unique.xlsx"  # Excel file with metadata

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Load the metadata Excel file
# Rebinds the module-level `df` with every column requested as text (dtype=str).
df = pd.read_excel(metadata_file, dtype=str)

# Regex to match a 2-3 uppercase abbreviation at the end of the last line
# An optional trailing "." is part of the match.
abbreviation_regex = re.compile(r"[A-Z]{2,3}\.?$")
# Regex to match reviewer abbreviation anywhere in a line
# The "{}" placeholder is filled with the escaped abbreviation; \b on both sides
# limits matches to whole words.
reviewer_regex_template = r"\b{}\b"

# Track updated files
updated_files = []

# Function to clean reviewer abbreviation from lines
def remove_reviewer(lines, reviewer_abbreviation):
    """Remove a reviewer abbreviation from the first line that contains it.

    The abbreviation is matched literally (it is regex-escaped) and only as a
    standalone token, i.e. not directly preceded or followed by a word
    character. Every occurrence inside that first matching line is removed;
    later lines are left untouched.

    Args:
        lines: List of text lines, e.g. as returned by ``readlines()``.
        reviewer_abbreviation: The abbreviation to remove. An empty value is
            treated as "nothing to remove".

    Returns:
        A ``(updated_lines, reviewer_removed)`` tuple: a new list of lines and
        a bool telling whether the abbreviation was found and removed.
    """
    # An empty abbreviation would compile to a pattern that matches at every word
    # boundary and report a removal although no text changes.
    if not reviewer_abbreviation:
        return list(lines), False

    # Lookarounds instead of \b: \b next to punctuation requires an adjacent word
    # character, so an abbreviation such as "R.S." at the end of a line would never
    # match. For purely alphanumeric abbreviations both forms behave the same.
    reviewer_regex = re.compile(
        r"(?<!\w){}(?!\w)".format(re.escape(reviewer_abbreviation))
    )

    updated_lines = []
    reviewer_removed = False
    for line in lines:
        if not reviewer_removed and reviewer_regex.search(line):
            line = reviewer_regex.sub("", line)  # Remove the abbreviation
            reviewer_removed = True  # Only the first matching line is altered
        updated_lines.append(line)
    return updated_lines, reviewer_removed

# Clean every review: drop a trailing initials-style abbreviation from the last
# text line, then remove the reviewer abbreviation recorded in the metadata.
for review_file in os.listdir(input_folder):
    input_file_path = os.path.join(input_folder, review_file)
    output_file_path = os.path.join(output_folder, review_file)

    # Anything that is not a regular .txt file is ignored
    if not (os.path.isfile(input_file_path) and review_file.endswith(".txt")):
        continue

    with open(input_file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Stripped copies of the lines that actually carry text
    non_empty_lines = [text for text in map(str.strip, lines) if text]
    if not non_empty_lines:
        # Blank file: copy it through untouched
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.writelines(lines)
        continue

    last_line = non_empty_lines[-1]

    # Only the final five characters of the last text line are probed
    has_trailing_abbreviation = bool(abbreviation_regex.search(last_line[-5:]))

    if has_trailing_abbreviation:
        cleaned_last_line = abbreviation_regex.sub("", last_line).rstrip()

        # Walk backwards to the last line with text and overwrite it
        for i in reversed(range(len(lines))):
            if lines[i].strip():
                lines[i] = cleaned_last_line + "\n"
                break

    # The file name without its extension is the key into the metadata table
    file_id = os.path.splitext(review_file)[0]
    matching_row = df[df['ID'] == file_id]

    reviewer_removed = False
    if not matching_row.empty:
        reviewer_value = matching_row.iloc[0]['Reviewer']
        if pd.notna(reviewer_value):
            reviewer_abbreviation = reviewer_value.strip()
            lines, reviewer_removed = remove_reviewer(lines, reviewer_abbreviation)

    # The file is always written, modified or not
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.writelines(lines)

    # Record the file when either clean-up step changed something
    if has_trailing_abbreviation or reviewer_removed:
        updated_files.append(review_file)

# Report what happened
print("\n--- Processing Summary ---")
print(f"Total files updated (abbreviations or reviewers removed): {len(updated_files)}")
if not updated_files:
    print("No files had abbreviations or reviewers removed.")
else:
    print("\nFiles with modifications:")
    for file in updated_files:
        print(f" - {file}")



--- Processing Summary ---
Total files updated (abbreviations or reviewers removed): 3703

Files with modifications:
 - 41_(01)_1987_001.txt
 - 41_(01)_1987_002.txt
 - 41_(01)_1987_003.txt
 - 41_(01)_1987_004.txt
 - 41_(01)_1987_005.txt
 - 41_(01)_1987_006.txt
 - 41_(01)_1987_007.txt
 - 41_(01)_1987_008.txt
 - 41_(01)_1987_009.txt
 - 41_(01)_1987_010.txt
 - 41_(01)_1987_011.txt
 - 41_(01)_1987_012.txt
 - 41_(01)_1987_013.txt
 - 41_(01)_1987_014.txt
 - 41_(01)_1987_015.txt
 - 41_(01)_1987_016.txt
 - 41_(01)_1987_017.txt
 - 41_(01)_1987_018.txt
 - 41_(01)_1987_019.txt
 - 41_(01)_1987_020.txt
 - 41_(01)_1987_021.txt
 - 41_(01)_1987_022.txt
 - 41_(01)_1987_023.txt
 - 41_(01)_1987_024.txt
 - 41_(01)_1987_025.txt
 - 41_(01)_1987_026.txt
 - 41_(01)_1987_027.txt
 - 41_(01)_1987_028.txt
 - 41_(01)_1987_029.txt
 - 41_(01)_1987_030.txt
 - 41_(01)_1987_031.txt
 - 41_(01)_1987_032.txt
 - 41_(01)_1987_033.txt
 - 41_(01)_1987_034.txt
 - 41_(01)_1987_035.txt
 - 41_(01)_1987_036.txt
 - 41_(01)_1987_03

### Removing lines which ONLY say "Reviewed from galleys"

In [86]:
# Stage 4 -> stage 5 folders
input_folder, output_folder = "Reviews Processed4", "Reviews Processed5"

# Make sure the destination exists before anything is written
os.makedirs(output_folder, exist_ok=True)

# A line consisting solely of "Reviewed from galleys" (any case), optionally
# followed by '.', '!' or '?' and surrounded by whitespace
reviewed_regex = re.compile(r"^\s*reviewed from galleys[\.\!\?]*\s*$", re.IGNORECASE)

# Names of the files from which at least one line was dropped
updated_files = list()

for review_file in os.listdir(input_folder):
    input_file_path = os.path.join(input_folder, review_file)
    output_file_path = os.path.join(output_folder, review_file)

    # Anything that is not a regular .txt file is ignored
    if not (os.path.isfile(input_file_path) and review_file.endswith(".txt")):
        continue

    with open(input_file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Keep every line that is not a bare "Reviewed from galleys" line
    cleaned_lines = []
    for line in lines:
        if reviewed_regex.match(line.strip()) is None:
            cleaned_lines.append(line)

    # When nothing was dropped, cleaned_lines equals lines, so one write covers both cases
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.writelines(cleaned_lines)

    if len(cleaned_lines) != len(lines):
        updated_files.append(review_file)

# Report what happened
print("\n--- Processing Summary ---")
print(f"Total files updated (lines removed): {len(updated_files)}")
if not updated_files:
    print("No files had matching lines removed.")
else:
    print("\nFiles with removed lines:")
    for file in updated_files:
        print(f" - {file}")


--- Processing Summary ---
Total files updated (lines removed): 159

Files with removed lines:
 - 41_(01)_1987_006.txt
 - 41_(01)_1987_008.txt
 - 41_(01)_1987_016.txt
 - 41_(01)_1987_018.txt
 - 41_(01)_1987_019.txt
 - 41_(01)_1987_030.txt
 - 41_(01)_1987_058.txt
 - 41_(02)_1987_004.txt
 - 41_(02)_1987_010.txt
 - 41_(02)_1987_014.txt
 - 41_(02)_1987_016.txt
 - 41_(02)_1987_024.txt
 - 41_(02)_1987_025.txt
 - 41_(02)_1987_027.txt
 - 41_(02)_1987_032.txt
 - 41_(02)_1987_042.txt
 - 41_(02)_1987_047.txt
 - 41_(02)_1987_052.txt
 - 41_(02)_1987_055.txt
 - 41_(02)_1987_059.txt
 - 41_(03)_1987_004.txt
 - 41_(03)_1987_012.txt
 - 41_(03)_1987_021.txt
 - 41_(03)_1987_051.txt
 - 41_(03)_1987_055.txt
 - 41_(03)_1987_056.txt
 - 41_(03)_1987_061.txt
 - 41_(04)_1987_023.txt
 - 41_(05)_1988_017.txt
 - 41_(05)_1988_038.txt
 - 41_(05)_1988_055.txt
 - 41_(05)_1988_056.txt
 - 41_(05)_1988_081.txt
 - 41_(06)_1988_005.txt
 - 41_(06)_1988_047.txt
 - 41_(06)_1988_052.txt
 - 41_(06)_1988_060.txt
 - 41_(06)_1988_

### Remove lines containing both "reviewed from galleys" and a price ("$")

In [87]:
import os
import re

# Stage 5 -> stage 6 folders
input_folder, output_folder = "Reviews Processed5", "Reviews Processed6"

# Make sure the destination exists before anything is written
os.makedirs(output_folder, exist_ok=True)

# A line is dropped when it contains both of these (compared against the lowercased line)
search_phrase, price_symbol = "reviewed from galleys", "$"

# Function to process a single file
def process_file(input_path, output_path, filename):
    """Copy one review to ``output_path`` without its priced "galleys" lines.

    A line is dropped when its lowercased text contains both ``search_phrase``
    and ``price_symbol``. All other lines are written unchanged.

    Args:
        input_path: Path of the review file to read (UTF-8).
        output_path: Path the cleaned review is written to (UTF-8).
        filename: Name of the file; not used inside the function.

    Returns:
        The dropped lines, whitespace-stripped, in file order (empty if none).

    NOTE(review): a later cell defines another ``process_file`` that takes only
    two arguments; re-running this cell's loop after that cell would call the
    wrong function.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    def is_priced_galley_line(text):
        # Case-insensitive check: compare against the lowercased line
        lowered = text.lower()
        return search_phrase in lowered and price_symbol in lowered

    removed_lines = [line.strip() for line in lines if is_priced_galley_line(line)]
    cleaned_lines = [line for line in lines if not is_priced_galley_line(line)]

    # The output file is written even when nothing was dropped
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(cleaned_lines)

    return removed_lines

# Run the clean-up over every .txt file and collect what was dropped
files_with_removals = 0
all_removed_lines = dict()

for filename in os.listdir(input_folder):
    if not filename.endswith(".txt"):
        continue

    input_file_path = os.path.join(input_folder, filename)
    output_file_path = os.path.join(output_folder, filename)

    removed_lines = process_file(input_file_path, output_file_path, filename)
    if not removed_lines:
        continue

    # Only files that actually lost lines are counted and reported
    files_with_removals += 1
    all_removed_lines[filename] = removed_lines
    print(f"Processed file: {filename} | Removed lines: {len(removed_lines)}")

# Final summary
print("\n--- Processing Summary ---")
print(f"Total files with removals: {files_with_removals}")
if files_with_removals:
    print("\nDetails of Removed Lines:")
    for file, lines in all_removed_lines.items():
        print(f"\nFile: {file}")
        for line in lines:
            print(f" - {line}")

Processed file: 41_(01)_1987_009.txt | Removed lines: 1
Processed file: 41_(01)_1987_032.txt | Removed lines: 1
Processed file: 41_(02)_1987_017.txt | Removed lines: 1
Processed file: 41_(02)_1987_023.txt | Removed lines: 1
Processed file: 41_(04)_1987_050.txt | Removed lines: 1
Processed file: 41_(05)_1988_002.txt | Removed lines: 1
Processed file: 41_(06)_1988_018.txt | Removed lines: 1
Processed file: 41_(06)_1988_026.txt | Removed lines: 1
Processed file: 41_(08)_1988_058.txt | Removed lines: 1
Processed file: 41_(08)_1988_060.txt | Removed lines: 1
Processed file: 41_(09)_1988_065.txt | Removed lines: 1
Processed file: 41_(11)_1988_068.txt | Removed lines: 1
Processed file: 42_(01)_1988_010.txt | Removed lines: 1
Processed file: 42_(01)_1988_011.txt | Removed lines: 1
Processed file: 42_(01)_1988_031.txt | Removed lines: 1
Processed file: 42_(01)_1988_053.txt | Removed lines: 1
Processed file: 42_(02)_1988_012.txt | Removed lines: 1
Processed file: 42_(02)_1988_014.txt | Removed l

### Remove lines which contain "Illustrator" information

In [88]:
import os
import re

# Stage 6 -> stage 7 folders
input_folder, output_folder = "Reviews Processed6", "Reviews Processed7"

# Make sure the destination exists before anything is written
os.makedirs(output_folder, exist_ok=True)

# Bibliographic markers; a line containing any of them is dropped
patterns = [
    r"illus\. by",         # 'illus. by'
    r"trade ed\.",         # 'Trade ed.'
    r"library ed\.",       # 'Library ed.'
    r"paper ed\."          # 'Paper ed.'
]

# One case-insensitive alternation built from all markers
combined_pattern = re.compile("|".join(patterns), flags=re.IGNORECASE)

# Names of the files from which at least one line was dropped
updated_files = list()

# Function to process files
def process_file(input_path, output_path):
    """Copy one review to ``output_path`` without lines matching ``combined_pattern``.

    Each line is lowercased before it is searched. The output file is written
    whether or not any line was dropped.

    Args:
        input_path: Path of the review file to read (UTF-8).
        output_path: Path the cleaned review is written to (UTF-8).

    Returns:
        True if at least one line was dropped, otherwise False.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Keep only the lines in which none of the markers occurs
    cleaned_lines = [line for line in lines if not combined_pattern.search(line.lower())]

    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(cleaned_lines)

    # Fewer lines out than in means something was dropped
    updated = len(cleaned_lines) != len(lines)
    return updated

# Process each file
for review_file in os.listdir(input_folder):
    input_file_path = os.path.join(input_folder, review_file)
    output_file_path = os.path.join(output_folder, review_file)

    # Check if it's a valid text file
    if os.path.isfile(input_file_path) and review_file.endswith(".txt"):
        # process_file() writes the output file on every call, including when no
        # line was dropped, so unchanged files need no second read-and-copy pass.
        if process_file(input_file_path, output_file_path):
            updated_files.append(review_file)

# Print final summary
print("\n--- Processing Summary ---")
print(f"Total files updated (lines removed): {len(updated_files)}")
if updated_files:
    print("\nFiles with lines removed:")
    for file in updated_files:
        print(f" - {file}")
else:
    print("No files had lines removed.")


--- Processing Summary ---
Total files updated (lines removed): 1013

Files with lines removed:
 - 41_(01)_1987_007.txt
 - 41_(01)_1987_010.txt
 - 41_(01)_1987_011.txt
 - 41_(01)_1987_013.txt
 - 41_(01)_1987_022.txt
 - 41_(01)_1987_023.txt
 - 41_(01)_1987_029.txt
 - 41_(01)_1987_031.txt
 - 41_(01)_1987_048.txt
 - 41_(01)_1987_050.txt
 - 41_(01)_1987_055.txt
 - 41_(01)_1987_059.txt
 - 41_(01)_1987_063.txt
 - 41_(02)_1987_010.txt
 - 41_(02)_1987_012.txt
 - 41_(02)_1987_017.txt
 - 41_(02)_1987_022.txt
 - 41_(02)_1987_024.txt
 - 41_(02)_1987_031.txt
 - 41_(02)_1987_036.txt
 - 41_(02)_1987_043.txt
 - 41_(02)_1987_046.txt
 - 41_(02)_1987_049.txt
 - 41_(02)_1987_051.txt
 - 41_(02)_1987_058.txt
 - 41_(02)_1987_061.txt
 - 41_(02)_1987_062.txt
 - 41_(02)_1987_064.txt
 - 41_(03)_1987_006.txt
 - 41_(03)_1987_008.txt
 - 41_(03)_1987_012.txt
 - 41_(03)_1987_013.txt
 - 41_(03)_1987_014.txt
 - 41_(03)_1987_015.txt
 - 41_(03)_1987_019.txt
 - 41_(03)_1987_020.txt
 - 41_(03)_1987_025.txt
 - 41_(03)_1987

### Remove lines which contain values from both Author and Title column

In [89]:
# Stage 7 -> stage 8 folders and the metadata workbook used by this step
input_folder, output_folder = "Reviews Processed7", "Reviews Processed8"
metadata_file = "Chunk 6 IDs Unique.xlsx"

# Make sure the destination exists before anything is written
os.makedirs(output_folder, exist_ok=True)

# Metadata table; every column is read as text
df = pd.read_excel(metadata_file, dtype=str)

# Helper function to clean and normalize text
def clean_text(text):
    """Normalize ``text`` for comparison.

    The value is converted with ``str()``, trimmed, every run of whitespace is
    collapsed to a single space, and the result is lowercased.
    """
    return re.sub(r'\s+', ' ', str(text).strip()).lower()

# Function to check if both Author and Title are in the line
def contains_author_and_title(line, author, title):
    """Tell whether ``line`` contains both the author and the title.

    All three values are compared after ``clean_text`` normalization, as plain
    substring checks.

    Args:
        line: The text line to inspect.
        author: Author value from the metadata; may be missing (None/NaN).
        title: Title value from the metadata; may be missing (None/NaN).

    Returns:
        True only when both author and title are present (non-missing and
        non-blank) and both occur in the line; otherwise False.
    """
    search_terms = []
    for value in (author, title):
        # A missing cell must not be stringified: "nan"/"none" would match ordinary
        # words such as "Nancy" and cause real review lines to be deleted.
        if value is None or (isinstance(value, float) and value != value):
            return False
        term = clean_text(value)
        # A blank term is a substring of every line, so it can never count as a match.
        if not term:
            return False
        search_terms.append(term)

    # Normalize the line once and reuse it for both checks
    normalized_line = clean_text(line)
    return all(term in normalized_line for term in search_terms)

# Names of the files from which at least one line was dropped
updated_files = list()

# Drop, from every review, the lines that repeat both its Author and its Title
for review_file in os.listdir(input_folder):
    input_file_path = os.path.join(input_folder, review_file)
    output_file_path = os.path.join(output_folder, review_file)

    # Anything that is not a regular .txt file is ignored
    if not (os.path.isfile(input_file_path) and review_file.endswith(".txt")):
        continue

    with open(input_file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # The file name without its extension is the key into the metadata table
    file_id = os.path.splitext(review_file)[0]
    matching_row = df[df['ID'] == file_id]

    if matching_row.empty:
        # No metadata for this review: copy it through untouched
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.writelines(lines)
        continue

    # The first matching row supplies the values to look for
    author = matching_row.iloc[0]['Author']
    title = matching_row.iloc[0]['Title']

    cleaned_lines = [
        line for line in lines
        if not contains_author_and_title(line, author, title)
    ]
    # Fewer lines out than in means something was dropped
    line_removed = len(cleaned_lines) != len(lines)

    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.writelines(cleaned_lines)

    if line_removed:
        updated_files.append(review_file)

# Report what happened
print("\n--- Processing Summary ---")
print(f"Total files updated (lines with Author and Title removed): {len(updated_files)}")
if not updated_files:
    print("No files had lines removed.")
else:
    print("\nFiles with modifications:")
    for file in updated_files:
        print(f" - {file}")


--- Processing Summary ---
Total files updated (lines with Author and Title removed): 484

Files with modifications:
 - 41_(01)_1987_027.txt
 - 41_(01)_1987_032.txt
 - 41_(01)_1987_042.txt
 - 41_(01)_1987_045.txt
 - 41_(01)_1987_049.txt
 - 41_(01)_1987_054.txt
 - 41_(02)_1987_002.txt
 - 41_(02)_1987_019.txt
 - 41_(02)_1987_020.txt
 - 41_(02)_1987_026.txt
 - 41_(02)_1987_029.txt
 - 41_(02)_1987_035.txt
 - 41_(03)_1987_028.txt
 - 41_(03)_1987_035.txt
 - 41_(03)_1987_041.txt
 - 41_(03)_1987_052.txt
 - 41_(04)_1987_026.txt
 - 41_(04)_1987_047.txt
 - 41_(04)_1987_049.txt
 - 41_(04)_1987_052.txt
 - 41_(05)_1988_014.txt
 - 41_(05)_1988_027.txt
 - 41_(05)_1988_030.txt
 - 41_(05)_1988_045.txt
 - 41_(05)_1988_068.txt
 - 41_(05)_1988_077.txt
 - 41_(05)_1988_082.txt
 - 41_(06)_1988_035.txt
 - 41_(06)_1988_053.txt
 - 41_(06)_1988_062.txt
 - 41_(07)_1988_006.txt
 - 41_(07)_1988_007.txt
 - 41_(07)_1988_024.txt
 - 41_(07)_1988_032.txt
 - 41_(07)_1988_034.txt
 - 41_(07)_1988_040.txt
 - 41_(07)_1988_04

# Data Validation

## Count the review texts that still contain "$"

In [90]:
import os

# Folder holding the output of the last clean-up stage
reviews_folder = "Reviews Processed8"

# Names of the review files whose text still contains a '$'
files_with_dollar = list()

for review_file in os.listdir(reviews_folder):
    file_path = os.path.join(reviews_folder, review_file)

    # Anything that is not a regular .txt file is ignored
    if not (os.path.isfile(file_path) and review_file.endswith(".txt")):
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # One occurrence anywhere in the text is enough to list the file
    if content.find("$") != -1:
        files_with_dollar.append(review_file)

# Report the count followed by one file name per line
print(f"Number of files containing the '$' symbol: {len(files_with_dollar)}")
print("\nFiles containing the '$' symbol:")
for file_name in files_with_dollar:
    print(file_name)

Number of files containing the '$' symbol: 43

Files containing the '$' symbol:
41_(02)_1987_027.txt
41_(03)_1987_010.txt
41_(04)_1987_043.txt
41_(05)_1988_059.txt
41_(06)_1988_020.txt
41_(07)_1988_027.txt
41_(07)_1988_057.txt
41_(09)_1988_046.txt
41_(09)_1988_051.txt
42_(01)_1988_060.txt
42_(02)_1988_082.txt
42_(03)_1988_057.txt
42_(04)_1988_018.txt
42_(05)_1989_019.txt
42_(07)_1989_029.txt
42_(09)_1989_026.txt
42_(09)_1989_032.txt
42_(09)_1989_039.txt
42_(10)_1989_010.txt
42_(11)_1989_042.txt
43_(04)_1989_052.txt
43_(05)_1990_035.txt
43_(05)_1990_054.txt
43_(06)_1990_043.txt
43_(06)_1990_044.txt
43_(10)_1990_028.txt
43_(10)_1990_068.txt
44_(03)_1990_009.txt
44_(04)_1990_061.txt
44_(05)_1991_019.txt
44_(05)_1991_050.txt
44_(06)_1991_049.txt
44_(07)_1991_034.txt
44_(09)_1991_007.txt
44_(10)_1991_042.txt
44_(11)_1991_059.txt
45_(02)_1991_037.txt
45_(02)_1991_052.txt
45_(05)_1992_002.txt
45_(07)_1992_032.txt
45_(09)_1992_018.txt
45_(09)_1992_019.txt
45_(10)_1992_062.txt


## Count the number of empty review texts

In [82]:
import os

# Folder whose review files are checked for emptiness.
# NOTE(review): this points at the stage-1 folder, not the final "Reviews Processed8"
# output - confirm that this is the stage meant to be validated.
input_folder = "Reviews Processed1"  # Replace with your folder name

# Names of the files that hold no text at all
empty_files = list()

for filename in os.listdir(input_folder):
    file_path = os.path.join(input_folder, filename)

    # Anything that is not a regular .txt file is ignored
    if not (os.path.isfile(file_path) and filename.endswith(".txt")):
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read().strip()  # whitespace-only files count as empty

    if content == "":
        empty_files.append(filename)

# Report what was found
print("\n--- Empty Files Summary ---")
print(f"Total empty files: {len(empty_files)}")
if not empty_files:
    print("No empty files found.")
else:
    print("\nList of empty files:")
    for file in empty_files:
        print(f" - {file}")


--- Empty Files Summary ---
Total empty files: 0
No empty files found.
