In [1]:
pip install PyPDF2 docx2txt transformers



In [2]:
pip install fitz



In [3]:
pip install PyMuPDF



In [4]:
import os
import glob
import fitz  # PyMuPDF for PDF extraction
import docx2txt  # for extracting text from Word documents
import re
import torch
from transformers import pipeline
import pandas as pd
import logging
from multiprocessing import Pool, set_start_method  # Import set_start_method
from functools import partial

In [5]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Check for GPU availability
if torch.cuda.is_available():
    device_id = 0  # Change this to the ID of your GPU or 'cpu' for CPU mode
    logging.info(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    device_id = 'cpu'
    logging.info("No GPU available, using CPU.")

In [7]:
device_id

0

In [8]:
# Set the multiprocessing start method to 'spawn'
try:
    set_start_method('spawn')
except RuntimeError:
    pass  # Ignore if it's already set

In [9]:
from numba import cuda
cuda.select_device(0)
cuda.close()

In [10]:
print(torch.cuda.device_count())  # Check the number of available GPUs

1


In [12]:
import os
import glob
import fitz  # PyMuPDF for PDF extraction
import docx2txt  # for extracting text from Word documents
import re
import torch
from transformers import pipeline
import pandas as pd

# Define the path to your CVs folder on Google Drive
cv_folder_path = '/content/drive/MyDrive/CVs'

# Initialize the Hugging Face summarization pipeline with custom max_length
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", revision="a4f8f3e", framework="pt", device=0, max_length=200)

# Initialize an empty DataFrame
summary_df = pd.DataFrame(columns=['CV File', 'Summary'])

# Function to summarize text with dynamic max_length
def summarize_text(text):
    try:
        # Calculate a dynamic max_length based on input text length
        max_length = min(len(text) // 2, 200)  # Adjust as needed
        return summarizer(text, max_length=max_length, min_length=30, do_sample=False)[0]['summary_text']
    except Exception as e:
        logging.error(f"Error summarizing text: {str(e)}")
        return ""
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file_path):
    text = ""
    try:
        pdf_document = fitz.open(pdf_file_path)
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]
            text += page.get_text()
    except Exception as e:
        print(f"Error extracting text from PDF {pdf_file_path}: {str(e)}")
    return text

# Function to extract text from a DOCX file
def extract_text_from_docx(docx_file_path):
    text = ""
    try:
        text = docx2txt.process(docx_file_path, encoding="utf-8")  # Specify encoding
    except Exception as e:
        print(f"Error extracting text from DOCX {docx_file_path}: {str(e)}")
    return text

# Helper function to clean text from non-UTF-8 characters
def clean_non_utf8(text):
    return ''.join(char for char in text if char.isprintable())

# Helper function to split text into smaller chunks
def split_text(text, max_length):
    chunks = []
    words = text.split()
    current_chunk = ""
    for word in words:
        if len(current_chunk) + len(word) + 1 <= max_length:
            current_chunk += " " + word
        else:
            chunks.append(current_chunk.strip())
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

# Helper function to clean and preprocess text
def clean_text(text):
    text = re.sub(r'\n', ' ', text)  # Remove newlines
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text


# Iterate through files in the CV folder
for cv_file in glob.glob(os.path.join(cv_folder_path, '*.*')):
    file_extension = cv_file.split('.')[-1].lower()

    if file_extension == 'pdf':
        cv_text = extract_text_from_pdf(cv_file)
    elif file_extension == 'docx':
        cv_text = extract_text_from_docx(cv_file)
    else:
        # Handle other file types as needed (e.g., plain text files)
        continue

    # Clean and preprocess the extracted text
    cv_text = clean_text(cv_text)
    cv_text = clean_non_utf8(cv_text)

    # Split the CV text into smaller chunks
    text_chunks = split_text(cv_text, max_length=1000)

    # Initialize the summary for this CV
    summary = ""

    # Summarize each text chunk and append to the summary
    for chunk in text_chunks:
        chunk_summary = summarizer(chunk, max_length=200, min_length=30, do_sample=False)[0]['summary_text']
        summary += chunk_summary + " "

    # Append the CV file path and summary to the DataFrame
    summary_df = summary_df.append({'CV File': cv_file, 'Summary': summary}, ignore_index=True)

# Save the summary DataFrame to a CSV file
summary_df.to_csv('/content/drive/MyDrive/CVs/Summary_Sheet.csv', index=False)


  summary_df = summary_df.append({'CV File': cv_file, 'Summary': summary}, ignore_index=True)
Your max_length is set to 200, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
  summary_df = summary_df.append({'CV File': cv_file, 'Summary': summary}, ignore_index=True)
Your max_length is set to 200, but your input_length is only 188. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=94)
Your max_length is set to 200, but your input_length is only 172. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=86)
Your max_length is set to 200, but your input_length is only 18

In [14]:
summary_df.size

130

In [15]:
summary_df.head()

Unnamed: 0,CV File,Summary
0,/content/drive/MyDrive/CVs/Aman_Sharma.pdf,Aman Sharma is a software developer 2+ years ...
1,/content/drive/MyDrive/CVs/Aman_Jain.pdf,AMAN Jain has well-honed skills in computer s...
2,/content/drive/MyDrive/CVs/Debashish_Dey_AEMAA...,"Debashish Dey is an experienced IT project, p..."
3,/content/drive/MyDrive/CVs/Amarjeet_Sarma.pdf,AMARJEET SARMA Fullstack and Mobile Developer...
4,/content/drive/MyDrive/CVs/ankit asthana.pdf,ISTQB foundation certified testing profession...
