In [None]:
# Required packages
!pip install pdf2image
!pip install python-docx
!pip install pytesseract
!sudo apt-get install tesseract-ocr
!sudo apt-get install poppler-utils

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The

In [None]:
import os
import pytesseract
from pdf2image import convert_from_path
from docx import Document
from PIL import Image
import pandas as pd
import logging
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive so the CV folders used further down
# (paths under /content/drive/MyDrive/...) are reachable from this runtime.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell is expected to fail on a local Jupyter kernel - confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class CVTextExtractor:
    """Extract plain text from the CV documents found in a folder.

    PDFs are converted to page images with pdf2image and OCR'd with
    pytesseract; DOCX files are read with python-docx. For every document
    that yields text, a ``.txt`` file is written to the output folder, and a
    per-file summary is saved as ``extraction_summary.csv``.
    """

    def __init__(self, input_folder, output_folder):
        """Remember the folders, create the output folder and set up logging.

        Args:
            input_folder: Directory scanned (non-recursively) for documents.
            output_folder: Directory receiving the text files, the summary
                CSV and ``extraction_log.txt``. Created if missing.
        """
        self.input_folder = input_folder
        self.output_folder = output_folder

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_folder, exist_ok=True)

        # logging.basicConfig() silently does nothing when the root logger
        # already has handlers, which is the case in notebook runtimes, so the
        # log file was never written. force=True replaces the existing
        # handlers; warnings and errors are still echoed to the console.
        log_path = os.path.join(output_folder, 'extraction_log.txt')
        file_handler = logging.FileHandler(log_path, encoding='utf-8')
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.WARNING)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[file_handler, console_handler],
            force=True,
        )

    def extract_from_pdf(self, pdf_path):
        """OCR every page of a PDF and return the combined text.

        Returns:
            The stripped text of all pages, or None if conversion or OCR
            raised (the error is logged).
        """
        try:
            pages = convert_from_path(pdf_path)
            # One join instead of repeated string concatenation per page.
            text = "".join(pytesseract.image_to_string(page) for page in pages)
            return text.strip()
        except Exception as e:
            logging.error(f"Error processing PDF {pdf_path}: {str(e)}")
            return None

    def _collect_table_text(self, tables, seen_cells=None):
        """Return the non-empty text of every cell in ``tables``.

        Nested tables are visited recursively. ``seen_cells`` tracks the
        underlying cell elements already emitted, because a merged cell can be
        reported once per grid position it spans.
        """
        if seen_cells is None:
            seen_cells = set()

        lines = []
        for table in tables:
            for row in table.rows:
                for cell in row.cells:
                    # _tc is python-docx's underlying XML element for the
                    # cell; if it is unavailable we simply skip de-duplication.
                    cell_key = getattr(cell, '_tc', None)
                    if cell_key is not None:
                        if cell_key in seen_cells:
                            continue
                        seen_cells.add(cell_key)

                    cell_text = cell.text.strip()
                    if cell_text:
                        lines.append(cell_text)
                    lines.extend(self._collect_table_text(cell.tables, seen_cells))
        return lines

    def extract_from_docx(self, docx_path):
        """Return the text of a DOCX file: body paragraphs, then table cells.

        ``doc.paragraphs`` only covers body-level paragraphs, so a CV laid out
        in tables would come back empty; table text is therefore appended.
        NOTE(review): text inside text boxes/shapes is still not extracted.

        Returns:
            The stripped text, or None if the file could not be opened
            (the error is logged).
        """
        try:
            doc = Document(docx_path)
            lines = [paragraph.text for paragraph in doc.paragraphs]
        except Exception as e:
            logging.error(f"Error processing DOCX {docx_path}: {str(e)}")
            return None

        # A problem while walking tables must not discard the paragraph text
        # that was already read successfully.
        try:
            lines.extend(self._collect_table_text(doc.tables))
        except Exception as e:
            logging.warning(f"Could not read tables in DOCX {docx_path}: {str(e)}")

        return "\n".join(lines).strip()

    def clean_text(self, text, preserve_newlines=False):
        """Normalise whitespace in extracted text.

        Args:
            text: Raw extracted text; None or empty yields "".
            preserve_newlines: When False (default, the historical behaviour)
                every run of whitespace, line breaks included, becomes a
                single space. When True, whitespace is collapsed within each
                line, blank lines are dropped and lines stay separated by
                a single newline.

        Returns:
            The cleaned text.
        """
        if not text:
            return ""

        if preserve_newlines:
            collapsed_lines = (' '.join(line.split()) for line in text.splitlines())
            return '\n'.join(line for line in collapsed_lines if line)

        # split() with no argument already removes every newline, so the
        # former replace('\n\n', '\n') step could never match and was dropped.
        return ' '.join(text.split())

    def process_document(self, file_path):
        """Extract text from one file, dispatching on its extension.

        Returns:
            The extracted text, or None for unsupported formats and for
            documents whose extraction failed.
        """
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.pdf':
            return self.extract_from_pdf(file_path)

        if file_extension == '.docx':
            return self.extract_from_docx(file_path)

        if file_extension == '.doc':
            # Still attempted for backward compatibility (a .docx saved with a
            # .doc extension opens fine), but python-docx cannot read the
            # legacy binary format, so explain the likely cause on failure.
            text = self.extract_from_docx(file_path)
            if text is None:
                logging.warning(
                    f"Legacy .doc files are not supported; convert to .docx: {file_path}"
                )
            return text

        logging.warning(f"Unsupported file format: {file_path}")
        return None

    def process_folder(self):
        """Process every supported document in the input folder.

        Writes one ``.txt`` file per successfully extracted document and an
        ``extraction_summary.csv`` into the output folder.

        Returns:
            A list of dicts with keys ``file_name``, ``status`` (``'success'``,
            ``'failed'`` or ``'error'``) and ``text`` (the cleaned text, an
            empty string, or the error message respectively).
        """
        results = []

        # Sorted for a reproducible order; isfile() skips directories whose
        # name happens to end in a supported extension.
        files = sorted(
            f for f in os.listdir(self.input_folder)
            if f.lower().endswith(('.pdf', '.docx', '.doc'))
            and os.path.isfile(os.path.join(self.input_folder, f))
        )

        print(f"Found {len(files)} files to process")

        # Output names already written in this run (compared case-insensitively).
        used_output_names = set()

        for file_name in tqdm(files):
            file_path = os.path.join(self.input_folder, file_name)

            logging.info(f"Processing {file_name}")

            try:
                extracted_text = self.process_document(file_path)

                if extracted_text:
                    cleaned_text = self.clean_text(extracted_text)

                    # "cv.pdf" and "cv.docx" would both map to "cv.txt" and the
                    # second would overwrite the first; on a clash keep the
                    # full original name to stay unique.
                    output_name = f"{os.path.splitext(file_name)[0]}.txt"
                    if output_name.lower() in used_output_names:
                        output_name = f"{file_name}.txt"
                    used_output_names.add(output_name.lower())

                    output_file = os.path.join(self.output_folder, output_name)
                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write(cleaned_text)

                    results.append({
                        'file_name': file_name,
                        'status': 'success',
                        'text': cleaned_text
                    })

                    logging.info(f"Successfully processed {file_name}")
                else:
                    results.append({
                        'file_name': file_name,
                        'status': 'failed',
                        'text': ''
                    })
                    logging.error(f"Failed to extract text from {file_name}")

            except Exception as e:
                # Keep going: one bad file must not abort the whole batch.
                logging.error(f"Error processing {file_name}: {str(e)}")
                results.append({
                    'file_name': file_name,
                    'status': 'error',
                    'text': str(e)
                })

        # Create summary DataFrame
        df = pd.DataFrame(results)
        df.to_csv(os.path.join(self.output_folder, 'extraction_summary.csv'), index=False)

        return results

In [None]:
# Source folder holding the CVs and destination folder for the extracted text.
input_folder = '/content/drive/MyDrive/CVs'  # Adjust path as needed
output_folder = '/content/drive/MyDrive/CVs/output'  # Adjust path as needed

# Build the extractor and run it over every document in the input folder.
extractor = CVTextExtractor(input_folder=input_folder, output_folder=output_folder)
results = extractor.process_folder()

# Summarise the run: count the entries whose status is 'success'.
success_count = len([entry for entry in results if entry['status'] == 'success'])
print("\nProcessing complete!")
print(f"Successfully processed: {success_count}/{len(results)} files")
print(f"Results saved to: {output_folder}")

Found 25 files to process


 28%|██▊       | 7/25 [01:39<05:37, 18.73s/it]ERROR:root:Failed to extract text from SW_ML_EsraaSayed.docx
 48%|████▊     | 12/25 [02:54<04:07, 19.02s/it]ERROR:root:Failed to extract text from SW_MLEngineer_MoatazMansour.docx
 68%|██████▊   | 17/25 [03:36<01:24, 10.61s/it]ERROR:root:Failed to extract text from DataScientist_MahmoudYoussef.docx
100%|██████████| 25/25 [05:04<00:00, 12.18s/it]


Processing complete!
Successfully processed: 22/25 files
Results saved to: /content/drive/MyDrive/CVs/output



