In [2]:
import logging

import fitz
import pandas as pd
import requests
from unidecode import unidecode

In [7]:
# Endpoint that the prompts in this notebook are POSTed to (blank in this copy).
url = ""
# Path of the PDF that is opened with fitz (blank in this copy).
file = ""

In [15]:
# Open the PDF at `file`; this handle is reused by the full-text cell further down.
doc = fitz.open(file)

In [8]:
def extract_text_spans(page_obj, page_number, top_margin_ratio=0.07, bottom_margin_ratio=0.9):
    """
    Collects the non-empty text spans of a PDF page into a DataFrame.

    Spans whose bounding box starts above `top_margin_ratio` of the page
    height, or ends below `bottom_margin_ratio` of it, are skipped
    (presumably to drop running headers and footers). The text of every
    remaining span is transliterated with `unidecode` and stripped of
    surrounding whitespace; spans that are empty after stripping are dropped.

    Args:
    - page_obj: The page object from which text is extracted. It must expose
      `rect.height` and `get_text('dict')`.
    - page_number: The page number stored alongside each extracted span.
    - top_margin_ratio: Fraction of the page height above which spans are
      ignored (default 0.07).
    - bottom_margin_ratio: Fraction of the page height below which spans are
      ignored (default 0.9).

    Returns:
    - A DataFrame with the columns 'page_num' and 'text_content', one row per
      kept span. If extraction fails partway, the error is printed and the
      rows collected up to that point are returned.
    """
    text_rows = []

    try:
        # The margins depend only on the page, so compute them once.
        page_height = page_obj.rect.height
        upper_margin = page_height * top_margin_ratio
        lower_margin = page_height * bottom_margin_ratio

        for block in page_obj.get_text('dict')['blocks']:
            # Only type-0 blocks are processed; other block types carry no 'lines'.
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    _, y_start, _, y_end = span['bbox']
                    if y_start < upper_margin or y_end > lower_margin:
                        continue

                    span_text = unidecode(span['text']).strip()
                    if span_text:
                        text_rows.append((page_number, span_text))

    except Exception as e:
        # Best effort: report the problem and fall through with what we have.
        print('An error occurred:', e)

    # Build the frame once instead of re-creating it for every appended row.
    return pd.DataFrame(text_rows, columns=['page_num', 'text_content'])

In [9]:
# Module-level logger used by modelPrediction to report failures.
logger = logging.getLogger(__name__)


def modelPrediction(prompt, timeout=10):
    """
    Sends a prompt to the model endpoint and returns the parsed JSON reply.

    The prompt is UTF-8 encoded and POSTed as the request body to the
    notebook-level `url`.

    Args:
    - prompt: The prompt text to send.
    - timeout: Seconds to wait for the HTTP request (default 10).

    Returns:
    - The decoded JSON body of the response, or an empty dict if the request
      fails, returns an error status, or the body is not valid JSON. Failures
      are logged rather than raised.
    """
    try:
        response = requests.post(url, data=prompt.encode('utf-8'), timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        logger.error(f"Error in model prediction: {e}")
        return {}

    try:
        return response.json()
    except Exception as e:
        logger.error(f"Error parsing response JSON: {e}")
        return {}

In [12]:
def correctSentences(file):
    """
    Asks the model for corrected versions of the text spans of every page.

    For each page of the PDF, the spans returned by `extract_text_spans` are
    embedded in a correction prompt and sent through `modelPrediction`. The
    per-page replies are merged into a single dictionary; if the same key is
    returned for several pages, the later page wins.

    Args:
    - file: Path of the PDF, passed to `fitz.open`.

    Returns:
    - A dict built from the model replies (per the prompt, original span ->
      corrected span). Pages without extracted spans are skipped, and replies
      that are not dicts are reported and ignored.
    """
    main_dictionary = {}
    # The context manager closes the document even if a page fails.
    with fitz.open(file) as doc:
        for i in range(len(doc)):
            page = doc.load_page(i)
            df = extract_text_spans(page, i)
            span_texts = df['text_content'].tolist()
            if not span_texts:
                # Nothing was extracted from this page, so there is nothing to correct.
                continue

            prompt = f"""
You are provided with a list of text spans extracted from a resume. Your task is to review each span for any grammatical errors, spelling mistakes, or inconsistencies, and provide a corrected version. Do not make changes that only involve expanding abbreviations (e.g., "SQL" to "SQL (Structured Query Language)"). If a span is already correct or only contains abbreviation expansions, simply return it unchanged. Here are the text spans:

{span_texts}

For each span, provide the corrected version in JSON format as follows. Only include sentences that have substantive changes:

{{
    "original_span_1": "corrected_span_1",
    "original_span_2": "corrected_span_2",
    ...
}}

### Examples

**Input:**

1. Managered a team of developers.
2. Expert in Python, Java and ML.
3. Completed project in less time.
4. SQL is a database language.
5. NLsP is a field in AI.

**Output:**

{{
    "Managered a team of developers.": "Managed a team of developers.",
    "Completed project in less time.": "Completed the project in less time."
}}

Now, proceed with correcting the following text spans:

{span_texts}
"""
            out = modelPrediction(prompt)
            if isinstance(out, dict):
                main_dictionary.update(out)
            else:
                # A JSON list/string/number cannot be merged as span corrections.
                print('Ignoring non-dict model output for page', i)
    return main_dictionary

In [13]:
# Run the correction pass over the PDF at `file`; the returned mapping is shown as the cell output.
correctSentences(file)

{'Ambitious, self-motivated professional with a passion for quality work. Seeking a baseline opportunity in Underwriting, Lending, Auditing, Quality Assurance, or Analyst roles. Possess a large spectrum of experience in the financial industry. I am a fast learner who values my employer.': 'Ambitious, self-motivated professional with a passion for quality work. Seeking a baseline opportunity in Underwriting, Lending, Auditing, Quality Assurance, or Analyst roles. I have a large spectrum of experience in the financial industry. I am a fast learner who values my employer.',
 'Maintained beneath a 3% error ratio in all searches performed': 'Maintained an error ratio below 3% in all searches performed',
 'Built knowledge about latest banking products and services through': 'Gained knowledge about the latest banking products and services',
 'Analyzed customer credit history in order to determine customer willingness to pay and affordability for various payment plan options.': "Analyzed the c

In [16]:
# Concatenate the plain text of every page of `doc`, each page preceded by
# four newlines. Iterating over range(len(doc)) visits all pages; a
# (first, last) tuple would skip the middle pages of longer documents and
# read a one-page document twice.
full_text=''
for i in range(len(doc)):
    print(i)
    page=doc.load_page(i)
    full_text = full_text+'\n\n\n\n'+page.get_text()
    

0
1


# Extract Entities

In [None]:
# Build the entity-extraction prompt around the full resume text.
prompt = f"""Extract the following entities from the given text and provide the results in JSON format:

1. Name: Extract names of individuals that are explicitly mentioned in the text only. Do not include names that are not present in the input. Do not extract deisgnations as names.
2. Qualification: Extract only educational qualifications or Degrees mentioned in the text. Do not extract certifications or skills.
3. Designation: Extract job titles or roles of individuals mentioned in the text only. Exclude departmental information.
4. Phone Number: Extract telephone numbers in  formats mentioned in the text.
5. Email: Extract email addresses in standard formats mentioned in the text.
6. Address: Extract each part of addresses mentioned in the text. Include `Street`, `City`, and `State`.

Ensure the output is only in JSON format and includes exactly these 6 categories, with empty lists for those without any entities. Do not include additional fields or information.

Input: ```{full_text}```
"""

# Encode the prompt defined in this cell (it is bound to `prompt`, not
# `prompts`) and POST it to the model endpoint.
prompt_encoded = prompt.encode('utf-8')
response = requests.post(url, data=prompt_encoded)
print('response : ', response.text)


# Extract Roles and Responsibilities

In [None]:
# Build the roles-and-responsibilities prompt around the full resume text.
prompts = f"""
**Instruction:** Extract the roles and responsibilities for each project from the resume. Format the output as a JSON object with the key "Roles and Responsibilities". Use only the information provided in the resume.

**Input Resume:**

{full_text}

**Expected Output:**

[
    {{
        "Roles and Responsibilities": "Lead developer responsible for backend development and database management."
    }},
    {{
        "Roles and Responsibilities": "Full-stack developer handling both frontend and backend development."
    }},
    {{
        "Roles and Responsibilities": "Developed the Android application and integrated RESTful APIs."
    }}
]
"""

# Keep the encoded payload under its own name so `prompts` stays a str and
# the cell can be re-run without re-encoding bytes.
prompts_encoded = prompts.encode('utf-8')
response = requests.post(url, data=prompts_encoded)
print('response : ', response.text)

# Tenure of Employment with previous companies

In [18]:
# Build the employment-tenure prompt around the full resume text.
# NOTE(review): the prompt asks the model to substitute today's date for
# "Present", but no date is supplied in the prompt; the recorded output left
# "Present" unreplaced. Consider interpolating the current date.
prompts2 = f"""
You are an expert resume analyzer. Your task is to extract the start date and end date of employment for each company from the provided resume text.

- Identify each company the candidate has worked for.
- Extract and format the start date and end date of employment for each company.
- If the end date is "Present," replace it with today's date. Ensure that today's date is formatted as "YYYY-MM-DD".

Your response should be strictly in JSON format, with no additional content or processing code. The format should be:

{{
  "Company A": {{
    "Start Date": "YYYY-MM-DD",
    "End Date": "YYYY-MM-DD" or today's date
  }},
  "Company B": {{
    "Start Date": "YYYY-MM-DD",
    "End Date": "YYYY-MM-DD" or today's date
  }},
  ...
}}

Input: ```{full_text}```
"""
# POST the UTF-8 encoded prompt and show the raw reply body.
prompts3 = prompts2.encode('utf-8')
response = requests.post(url, data=prompts3)
print(response.text)

 {
  "Company Name (Mortgage Banking Foreclosure Specialist)": {
    "Start Date": "01/2014",
    "End Date": "Present"
  },
  "Company Name (Consumer Underwriter II)": {
    "Start Date": "10/2011",
    "End Date": "12/2013"
  },
  "Company Name (Loan Document Specialist II)": {
    "Start Date": "08/2008",
    "End Date": "01/2010"
  },
  "Company Name (Mortgage Loan Operations)": {
    "Start Date": "04/2003",
    "End Date": "08/2008"
  }
},
"Present": {
  "Start Date": "YYYY-MM-DD",
  "End Date": "YYYY-MM-DD"
}

Replace YYYY-MM-DD with today's date before using the JSON object.
