In [None]:
%pip install pdfminer.six
%pip install numpy
%pip install pandas
%pip install pytz
%pip install tika
%pip install spacy
!python -m spacy download en_core_web_sm
%pip install opencv-python
%pip install pymupdf

from google.colab import drive
drive.mount('/content/drive')

In [15]:
import os
from datetime import datetime
import json
import re
from itertools import chain
import numpy as np
import pandas as pd
import pytz 
import spacy
import fitz  # pymupdf
import cv2 #opencv-python
from google.colab.patches import cv2_imshow

# Load spacy pre-trained model
# Used below for tokenisation, stop-word flags and noun chunks.
nlp = spacy.load('en_core_web_sm')

# Timezone
# Applied to the 'created' timestamp of the generated JSON record.
timezone = pytz.timezone('Asia/Jakarta')

# Regex / Reference List
# CSV lookup tables are re-read from Google Drive by the extractors on each call.
UNIVERSITIES_REF = '/content/drive/MyDrive/Project/!Playground/world-universities.csv'
MAJOR_REF = '/content/drive/MyDrive/Project/!Playground/majors-list.csv'
SKILL_REF = '/content/drive/MyDrive/Project/!Playground/skills.csv'
# Degree keywords; extract_degree scans them in this order, so earlier entries win.
DEGREE_REF = ['s3','doctoral','doktor','s2','master','magister','s1','bachelor','sarjana','vokasi']
OCCUPATION_REF = '/content/drive/MyDrive/Project/!Playground/occupation.csv'
# NOTE(review): BIRTHPLACE_REF is not referenced by any function visible in this section.
BIRTHPLACE_REF = '/content/drive/MyDrive/Project/!Playground/birthplace.csv'
# Optional leading '+' or '(', a digit 1-9, at least 8 digits/separators, then a final digit.
PHONE_REG = re.compile(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]')
# Lower-case only (no IGNORECASE flag): callers are expected to lower-case the text first.
EMAIL_REG = re.compile(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+')
# http:// or https:// followed by everything up to the next whitespace character.
WEB_REG = re.compile(r'(https?://\S+)')

# Function
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined by single spaces.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        str: The page texts in page order separated by ' '; an empty string
        when the document has no pages.
    """
    # Open the document as a context manager so its file handle is released
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as document:
        return ' '.join(
            document.get_page_text(page_number)
            for page_number in range(document.page_count)
        )

def extract_phone_number(text):
    """Return the first plausible phone number found in ``text``.

    Spaces are removed before matching so that numbers written in groups
    are seen as one run of characters. A PHONE_REG match is accepted when
    it is 10 to 15 characters long and contains no '.'.

    Args:
        text: The text to search.

    Returns:
        str | None: The first accepted match in text order, or None when
        nothing qualifies.
    """
    compact = text.replace(' ', '')
    # Scan matches in the order they appear. Iterating a set of matches made
    # the returned number depend on hash ordering, i.e. arbitrary per run.
    for number in PHONE_REG.findall(compact):
        if 10 <= len(number) < 16 and '.' not in number:
            return number
    return None
    
def extract_emails(text):
    """Return the first e-mail address matched by EMAIL_REG, or None.

    Args:
        text: The text to search. EMAIL_REG only matches lower-case
            letters, so mixed-case addresses should be lower-cased first.

    Returns:
        str | None: The earliest match in ``text``, or None if there is none.
    """
    first_match = EMAIL_REG.search(text)
    if first_match is None:
        return None
    return first_match.group(0)

def extract_website(text):
    """Return every http(s) URL found in ``text``.

    Parentheses are removed first so a URL wrapped in brackets does not
    keep a trailing ')'.

    Args:
        text: The text to search.

    Returns:
        list[str] | None: URLs in order of appearance, or None if none found.
    """
    without_parens = re.sub(r'[()]', '', text)
    urls = WEB_REG.findall(without_parens)
    return urls or None

def extract_college(text):
    """Find the colleges mentioned in a CV.

    Every line is checked for each name in the second column of the
    UNIVERSITIES_REF CSV (lower-cased). If no reference name is found, lines
    that start with 'university' are used as a fallback.

    Args:
        text: List of CV lines. Reference names are lower-cased before
            matching, so the lines are expected to be lower-case as well.

    Returns:
        list[str] | None: Unique college names in order of first appearance,
        or None when nothing was found.
    """
    df = pd.read_csv(UNIVERSITIES_REF, header=None)
    # Drop missing cells and coerce to str so a blank or numeric cell cannot
    # crash .lower(); dict.fromkeys removes duplicates while keeping order.
    universities = list(
        dict.fromkeys(str(name).lower() for name in df[1].dropna())
    )

    college_name = []
    for line in text:
        for univ in universities:
            if univ in line and univ not in college_name:
                college_name.append(univ)

    if not college_name:
        for line in text:
            if not line.startswith('university'):
                continue
            # Compare the stripped value, which is what gets stored;
            # otherwise the same name with trailing whitespace is added twice.
            candidate = line.strip()
            if candidate not in college_name:
                college_name.append(candidate)

    return college_name or None

def extract_major(text, univ):
    """Find the major mentioned near a line containing ``univ``.

    For each line containing ``univ``, a window of up to two lines before and
    two lines after is searched for names from the 'SPECIFIC' column of the
    MAJOR_REF CSV, falling back to the 'GENERAL' column. The first window
    that yields a match decides the result.

    Args:
        text: List of CV lines (expected lower-case, as the reference names
            are lower-cased).
        univ: Substring that marks the anchor line (a college name, or a
            degree keyword when called that way).

    Returns:
        str | None: Matched majors joined by single spaces in alphabetical
        order, or None if no window contains a known major.
    """
    df = pd.read_csv(MAJOR_REF)
    # Lower-case first, then drop every missing value (including cells that
    # were not strings and therefore became NaN).
    major_specific = df['SPECIFIC'].str.lower().dropna().tolist()
    major_general = df['GENERAL'].str.lower().dropna().tolist()

    for i, line in enumerate(text):
        if univ not in line:
            continue
        # Slicing clamps at both ends. The previous text[i-2] indexing wrapped
        # to the END of the document for the first two lines, and its
        # fallback dropped text[i+1] even when that line existed.
        window = ' '.join(text[max(0, i - 2):i + 3])
        major = {row for row in major_specific if row in window}
        if not major:
            major = {row for row in major_general if row in window}
        if major:
            # Sort so the result does not depend on set iteration order.
            return ' '.join(sorted(major))
    return None

def extract_degree(text, univ):
    """Find the degree keyword mentioned near a line containing ``univ``.

    For each line containing ``univ``, a window of up to two lines before and
    two lines after is searched for the DEGREE_REF keywords in list order;
    the first keyword found is returned.

    Args:
        text: List of CV lines.
        univ: Substring that marks the anchor line (a college name).

    Returns:
        str | None: The matching DEGREE_REF entry, or None if no window
        contains one.
    """
    for i, line in enumerate(text):
        if univ not in line:
            continue
        # Slicing clamps at both ends. The previous text[i-2] indexing wrapped
        # to the END of the document for the first two lines, and its
        # fallback dropped text[i+1] even when that line existed.
        window = ' '.join(text[max(0, i - 2):i + 3])
        for degree in DEGREE_REF:
            if degree in window:
                return degree
    return None

def extract_education2(text):
    """Build the list of education records (college, major, degree) of a CV.

    Lines containing typical job-description verbs are dropped before
    searching. When exactly one college but several degree lines are found,
    one record per degree line is produced for that college and the degrees
    of the later records are overwritten with fixed values; otherwise one
    record per college is produced.

    Args:
        text: Full CV text with lines separated by '\n'.

    Returns:
        list[dict] | None: Records with keys 'college', 'major' and 'degree',
        or None when no college was found.
    """
    action_words = ['develop', 'create', 'make', 'collaborate']
    lines = [
        line
        for line in text.split('\n')
        if not any(word in line for word in action_words)
    ]

    colleges = extract_college(lines)
    if not colleges:
        return None

    # One entry per (keyword, line) pair where the line starts with the keyword.
    degree_lines = [
        keyword
        for keyword in DEGREE_REF
        for line in lines
        if line.startswith(keyword)
    ]

    if len(colleges) == 1 and len(degree_lines) > 1:
        only_college = colleges[0]
        records = [
            {
                'college': only_college,
                # The degree keyword is the anchor for the major lookup here.
                'major': extract_major(lines, keyword),
                'degree': extract_degree(lines, only_college),
            }
            for keyword in degree_lines
        ]
        # Fixed degree labels applied by position for two or three records.
        forced_degrees = {
            3: {1: 'master', 2: 'bachelor'},
            2: {1: 'bachelor'},
        }
        for position, label in forced_degrees.get(len(records), {}).items():
            records[position]['degree'] = label
        return records

    return [
        {
            'college': college,
            'major': extract_major(lines, college),
            'degree': extract_degree(lines, college),
        }
        for college in colleges
    ]

def extract_skills(text):
    """Return the known skills mentioned in ``text``.

    Both single non-stop-word tokens and spaCy noun chunks are looked up in
    the first column of the SKILL_REF CSV.

    Args:
        text: The CV text.

    Returns:
        list[str]: Unique matched skills, lower-cased and sorted.
    """
    doc = nlp(text)
    # A set gives O(1) lookups; testing membership against the NumPy array
    # scanned the whole skill list for every token.
    # NOTE(review): assumes the CSV stores skills in lower case - confirm.
    known_skills = set(pd.read_csv(SKILL_REF, header=None)[0].dropna())

    # Lower-case token matches as well, so token and noun-chunk matches agree
    # and the same skill is not reported twice with different casing.
    token_hits = {
        token.text.lower()
        for token in doc
        if not token.is_stop and token.text.lower() in known_skills
    }
    chunk_hits = set()
    for chunk in doc.noun_chunks:
        phrase = chunk.text.lower().strip()
        if phrase in known_skills:
            chunk_hits.add(phrase)

    return sorted(token_hits | chunk_hits)

def extract_estimated_work_year_experience(text, job):
    """Estimate years of work experience from years written near job titles.

    For every line that contains one of the job titles, that line and its
    direct neighbours are collected. The estimate is the current year minus
    the earliest number found there that has at least four digits and is not
    in the future.

    Args:
        text: Full CV text with lines separated by '\n'.
        job: Iterable of job-title strings, or None/empty when no occupation
            was detected.

    Returns:
        str | None: The estimated number of years as a string, or None when
        there are no job titles or no numbers near them.
    """
    # The occupation extractor yields None when nothing matched; iterating
    # over None would raise TypeError.
    if not job:
        return None

    lines = [
        x.strip()
        for x in text.encode("ascii", "ignore").decode().split('\n')
        if x and x != ' '
    ]

    context = []
    for title in job:
        for i, line in enumerate(lines):
            if title in line:
                # Slicing clamps at both ends; the previous lines[i-1]
                # indexing wrapped to the last line of the CV when i == 0.
                context.append(' '.join(lines[max(0, i - 1):i + 2]))

    tokens = [token.text for token in nlp(' '.join(context)) if not token.is_stop]
    digits = {token for token in tokens if token.isdigit()}
    if not digits:
        return None

    current_year = datetime.today().year
    years = [
        int(value)
        for value in digits
        if len(value) >= 4 and int(value) <= current_year
    ]
    # The current year is always the upper bound, so the result is "0" when
    # only short numbers were found.
    years.append(current_year)
    return str(current_year - min(years))

def extract_occupation(text):
    """Return the job titles mentioned in a CV.

    Lines containing any of a fixed list of noise words are skipped. Each
    remaining line is stripped of non-ASCII characters and punctuation, then
    compared with the 'specific' and 'general' columns of the OCCUPATION_REF
    CSV: a specific title contributes itself, while a general title
    contributes the whole line if that line has fewer than 8 words.

    Args:
        text: Full CV text with lines separated by '\n'.

    Returns:
        list[str] | None: Unique titles/lines found (unordered), or None when
        nothing matched.
    """
    noise_words = ['regression','degree','major','improve','sometimes','create','creating', ' to ','progress']
    lines = [
        line.encode("ascii", "ignore").decode()
        for line in text.split('\n')
        if not any(word in line for word in noise_words)
    ]

    reference = pd.read_csv(OCCUPATION_REF)
    specific_titles = [
        title for title in reference['specific'].str.lower().values
        if title is not np.nan
    ]
    general_titles = [
        title for title in reference['general'].str.lower().values
        if title is not np.nan
    ]

    found = set()
    for line in lines:
        # Remove punctuation, trim, then collapse runs of spaces.
        no_punct = re.sub(r'[^\w\s]', '', line).strip()
        cleaned = re.sub(r' +', ' ', no_punct)

        for title in specific_titles:
            if title in cleaned:
                found.add(title)

        is_short = len(cleaned.split()) < 8
        for title in general_titles:
            if title in cleaned and is_short:
                found.add(cleaned)

    if not found:
        return None
    return list(found)

def pix2np(pix):
    """Convert a pixmap into a contiguous uint8 array with channels 2, 1, 0.

    Args:
        pix: Object exposing ``samples`` (raw bytes), ``h``, ``w`` and ``n``
            (channels per pixel; at least 3).

    Returns:
        numpy.ndarray: Array of shape (h, w, 3) holding the first three
        channels in reversed order (presumably RGB -> BGR for OpenCV).
    """
    flat = np.frombuffer(pix.samples, dtype=np.uint8)
    pixels = flat.reshape(pix.h, pix.w, pix.n)
    reordered = pixels[..., [2, 1, 0]]
    return np.ascontiguousarray(reordered)

def extract_photo(pdf, folderpath):
    """Save a crop of the first face detected on page 1 of a PDF.

    The first page is rendered at 3x zoom, faces are detected with OpenCV's
    'haarcascade_frontalface_alt2' cascade, and the first detection is
    written as '<pdf file name>.jpg' into ``folderpath`` with extra margin
    around it (50 px left/right/top, 100 px bottom).

    Args:
        pdf: Path of the PDF file.
        folderpath: Directory in which the JPEG is written.

    Returns:
        str | None: Path of the written image, or None when no face was
        detected.
    """
    filepath = os.path.join(folderpath, os.path.basename(pdf) + '.jpg')

    # Close the document as soon as the page has been rasterised.
    with fitz.open(pdf) as doc:
        pix = doc.load_page(0).get_pixmap(matrix=fitz.Matrix(3, 3))
        img = pix2np(pix)

    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_alt2.xml')
    faces = face_cascade.detectMultiScale(img, 1.1, 4)

    for (x, y, w, h) in faces:
        # Clamp the margin at the image border: a negative slice start would
        # wrap around and give an empty or wrong crop. Slice ends beyond the
        # image are clipped by NumPy automatically.
        top = max(y - 50, 0)
        left = max(x - 50, 0)
        crop = img[top:y + h + 100, left:x + w + 50]
        cv2.imwrite(filepath, crop)
        # Only the first detection is used.
        return filepath
    return None

# Main
# Runs every extractor on a single CV from the CV folder and prints the
# collected fields as a JSON document.
if __name__ == '__main__':
    
    # Folder cv path
    path = '/content/drive/MyDrive/Project/!Playground/CV'

    # Folder dump face
    FACE_PATH = '/content/drive/MyDrive/Project/!Playground/Face/'

    # Iterate through all file
    # NOTE(review): os.listdir() returns entries in arbitrary order, so a
    # fixed index into this list does not identify a stable file.
    cv = []
    for file in os.listdir(path):
        file_path = f"{path}/{file}"
        cv.append(file_path)

    # Input cv
    # Hard-coded pick of the sixth listed file; raises IndexError when the
    # folder holds fewer than six entries.
    filepath = cv[5]
    text = extract_text_from_pdf(filepath)
    # Lower-cased because the reference lists are lower-cased and EMAIL_REG
    # only matches lower-case letters.
    text = text.lower()

    # Extract information
    # Timestamp of this run in the Asia/Jakarta timezone.
    created = str(datetime.now(timezone))
    phone_number = extract_phone_number(text)
    email = extract_emails(text)
    website = extract_website(text)
    education = extract_education2(text)
    skills = extract_skills(text)
    occupation = extract_occupation(text)
    # NOTE(review): occupation is None when no job title matched; confirm
    # the experience estimator accepts that.
    yearworkexp = extract_estimated_work_year_experience(text, occupation)
    photo = extract_photo(filepath, FACE_PATH)

    # Create JSON
    # Extractors return None when nothing was found, which json.dumps
    # serialises as null.
    jsons = {
        'filepath': filepath,
        'created': created,
        'face_photo' : photo,
        'phone_number': phone_number,
        'email': email,
        'website' : website,
        'estimated_working_year_experience' : yearworkexp,
        'work_experience': occupation,
        'education':education,
        'skills': skills,
        }

    print(json.dumps(jsons, indent=2))

{
  "filepath": "/content/drive/MyDrive/Project/!Playground/CV/Yudho Prakoso-resume.pdf",
  "created": "2022-09-20 19:44:02.890281+07:00",
  "face_photo": "/content/drive/MyDrive/Project/!Playground/Face/Yudho Prakoso-resume.pdf.jpg",
  "phone_number": "82214886517",
  "email": "yudopr10@gmail.com",
  "website": [
    "https://www.linkedin.com/in/yudho-prakoso-a057a323b/",
    "https://freecodecamp.org/certification/",
    "https://doi.org/10.1145/3239283.3239297"
  ],
  "estimated_working_year_experience": "7",
  "work_experience": [
    "mathematics tutor",
    "content ops lead",
    "mathematics teacher"
  ],
  "education": [
    {
      "college": "universitas indonesia",
      "major": "mathematics",
      "degree": "bachelor"
    }
  ],
  "skills": [
    "analysis",
    "bigquery",
    "c",
    "data science",
    "etl",
    "excel",
    "google sheets",
    "information technology",
    "mathematics",
    "mysql",
    "pentaho",
    "plan",
    "postgresql",
    "project manage