In [12]:
!pip install thefuzz


Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.12.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.12.1-cp311-cp311-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.6 MB ? eta -:--:--
   ------------ --------------------------- 0.5/1.6 MB 2.1 MB/s eta 0:00:01
   -------------------------------- ------- 1.3/1.6 MB 2.4 MB/s eta 0:00:01
   ---------------------------------------- 1.6/1.6 MB 2.3 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, thefuzz
Successfully installed rapidfuzz-3.12.1 thefuzz-0.22.1


In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
import numpy as np
import json
import os
from pathlib import Path
import logging
from typing import List, Dict
import pandas as pd

In [17]:


# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ResumeChunker:
    def __init__(
        self,
        min_chunk_size: int = 100,
        max_chunk_size: int = 700,
        chunk_overlap: int = 50,
        embeddings_model: str = "all-MiniLM-L6-v2"
    ):
        self.embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

    def process_file(self, text: str) -> List[str]:
        try:
            # Initial splitting
            chunks = self.text_splitter.split_text(text)
            logger.info(f"Created {len(chunks)} initial chunks")
            return chunks
        except Exception as e:
            logger.error(f"Error processing text: {str(e)}")
            return []

def process_and_save_chunks(input_folder: str, output_folder: str):
    """
    Process all files in the input folder and save chunks to output folder
    """
    chunker = ResumeChunker()
    
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    logger.info(f"Created output folder: {output_folder}")

    # List all files in input folder
    input_files = os.listdir(input_folder)
    logger.info(f"Found {len(input_files)} files in input folder")

    all_chunks = []
    processed_files = 0

    for filename in input_files:
        file_path = os.path.join(input_folder, filename)
        logger.info(f"Processing file: {filename}")

        try:
            # Read the file
            with open(file_path, 'r', encoding='utf-8') as f:
                try:
                    data = json.load(f)
                    logger.info(f"Successfully loaded JSON from {filename}")
                except json.JSONDecodeError:
                    # If not JSON, try reading as plain text
                    f.seek(0)
                    data = {'content': f.read()}
                    logger.info(f"Loaded {filename} as plain text")

            # Extract content
            content = data.get('content', '')
            if not content:
                logger.warning(f"No content found in {filename}")
                continue

            # Process the content
            chunks = chunker.process_file(content)
            logger.info(f"Created {len(chunks)} chunks for {filename}")

            # Save chunks for this file
            chunks_with_metadata = []
            for i, chunk in enumerate(chunks):
                chunk_data = {
                    'original_file': filename,
                    'chunk_id': f"{filename}_chunk_{i}",
                    'content': chunk,
                }
                chunks_with_metadata.append(chunk_data)

            # Save individual file chunks
            output_path = os.path.join(output_folder, f"{filename}_chunks.json")
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(chunks_with_metadata, f, indent=2, ensure_ascii=False)
            logger.info(f"Saved chunks for {filename}")

            all_chunks.extend(chunks_with_metadata)
            processed_files += 1

        except Exception as e:
            logger.error(f"Error processing {filename}: {str(e)}")
            continue

    # Save all chunks to a single file
    if all_chunks:
        # Save as JSON
        all_chunks_path = os.path.join(output_folder, "all_chunks.json")
        with open(all_chunks_path, 'w', encoding='utf-8') as f:
            json.dump(all_chunks, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved combined chunks to {all_chunks_path}")

        # Save as CSV
        df = pd.DataFrame(all_chunks)
        csv_path = os.path.join(output_folder, "all_chunks.csv")
        df.to_csv(csv_path, index=False)
        logger.info(f"Saved chunks to CSV: {csv_path}")

    logger.info(f"Processing complete. Processed {processed_files} files, created {len(all_chunks)} total chunks")
    return len(all_chunks)


def test_single_file(file_path: str, output_folder: str):
    """
    Test processing a single file
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            logger.info(f"Reading file: {file_path}")
            content = f.read()
            logger.info(f"File content length: {len(content)}")

        chunker = ResumeChunker()
        chunks = chunker.process_file(content)
        logger.info(f"Created {len(chunks)} chunks")

        # Save test output
        os.makedirs(output_folder, exist_ok=True)
        test_output_path = os.path.join(output_folder, "test_chunks.json")
        with open(test_output_path, 'w', encoding='utf-8') as f:
            json.dump(chunks, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved test chunks to {test_output_path}")

        return chunks
    except Exception as e:
        logger.error(f"Error in test: {str(e)}")
        return None



In [18]:

input_folder = "CVs\output"
output_folder = "CVs\chunked_output"

# check if the input folder exists and has files
if not os.path.exists(input_folder):
    logger.error(f"Input folder '{input_folder}' does not exist!")
else:
    files = os.listdir(input_folder)
    if not files:
        logger.error(f"Input folder '{input_folder}' is empty!")
    else:
        logger.info(f"Found {len(files)} files in {input_folder}")
        
        # Test with first file
        first_file = os.path.join(input_folder, files[0])
        logger.info(f"Testing with first file: {first_file}")
        test_chunks = test_single_file(first_file, output_folder)
        
        if test_chunks:
            # If test successful, process all files
            logger.info("Processing all files...")
            total_chunks = process_and_save_chunks(input_folder, output_folder)
            logger.info(f"Completed processing with {total_chunks} total chunks")
        else:
            logger.error("Test processing failed, please check the errors above")

INFO:__main__:Found 23 files in CVs\output
INFO:__main__:Testing with first file: CVs\output\Ali Mohamed Behery_CV.txt
INFO:__main__:Reading file: CVs\output\Ali Mohamed Behery_CV.txt
INFO:__main__:File content length: 4585
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Created 9 initial chunks
INFO:__main__:Created 9 chunks
INFO:__main__:Saved test chunks to CVs\chunked_output\test_chunks.json
INFO:__main__:Processing all files...
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Created output folder: CVs\chunked_output
INFO:__main__:Found 23 files in input folder
INFO:__main__:Processing file: Ali Mohamed Behery_CV.txt
INFO:__main__:Loaded Ali Mohamed Behery_CV.txt as plain text
INFO:__main__:Created 9 