In [None]:
!pip install pypdf python-docx

In [2]:
import os
import json
from pypdf import PdfReader
from docx import Document

In [9]:
INPUT_DIR = "E:\\YuyangGPT\\dataset\\raw_data"      # folder with .pdf and .docx files
OUTPUT_FILE = "E:\\YuyangGPT\\dataset\\cleaned_data\\all_data.jsonl"  # combined output file

In [4]:
def extract_text_from_pdf(path):
    reader = PdfReader(path)
    pages = []
    for page in reader.pages:
        text = page.extract_text()
        if text:
            pages.append(text)
    return "\n".join(pages)

def extract_text_from_docx(path):
    doc = Document(path)
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    return "\n".join(paragraphs)

def clean_text(text):
    text = text.replace("\r", "\n")
    text = "\n".join(
        line.strip()
        for line in text.split("\n")
        if line.strip()
    )
    return text

def chunk_text(text, min_len=50):
    chunks = []
    for paragraph in text.split("\n"):
        if len(paragraph) >= min_len:
            chunks.append(paragraph)
    return chunks


In [None]:
with open(OUTPUT_FILE, "w", encoding="utf-8") as out_f:
    for filename in os.listdir(INPUT_DIR):
        if not filename.endswith((".pdf", ".docx")):
            continue

        path = os.path.join(INPUT_DIR, filename)
        print(f"Processing {filename}...")

        try:
            if filename.endswith(".pdf"):
                text = extract_text_from_pdf(path)
            else:
                text = extract_text_from_docx(path)

            text = clean_text(text)
            chunks = chunk_text(text)

            for chunk in chunks:
                record = {"text": chunk}
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")

        except Exception as e:
            print(f"Error processing {filename}: {e}")


Processing 2_8 Interview Prep.docx...
Processing BAC Consulting Presentation Notes.docx...
Processing Brochure.docx...
Processing Civics Politics Writeup.docx...
Processing College Leader Interview Prep.docx...
Processing Danish Visa Cover Letter.docx...
Processing earth day speech!.docx...
Processing EASU script.docx...
Processing empire book.docx...
Processing English Narrative.docx...
Processing Exile and Migration Final Project.docx...
Processing Final Project Proposal.docx...
Processing First Year Seminar Notes.docx...
Processing NYU Shanghai Personal Statement.docx...
Processing Oedipus Rex.docx...
Processing Progression 1 .docx...
Processing Recitation 5.docx...
Processing Recitation 7.docx...
Processing recitation assignment 9.docx...
Processing Samantha LinkedIn Message.docx...
Processing Script Outline.docx...
Processing Seminar Transcript Assignment.docx...
Processing Summer AI Integration Project.docx...
Processing task_5.docx...
Processing Texts & Ideas Presentation.docx..