In [None]:
!pip install pdfminer.six
!pip install numpy
!pip install tika
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install opencv-python
!pip install pymupdf

from google.colab import drive
drive.mount('/content/drive')

In [23]:
import os
from datetime import datetime
from dateutil.parser import parse
import json
import re
import numpy as np
import pandas as pd
import pytz 
from pdfminer.high_level import extract_text
from tika import parser  
import spacy
import fitz  
import cv2
from google.colab.patches import cv2_imshow

# Load spacy pre-trained model
nlp = spacy.load('en_core_web_sm')

# Timezone
timezone = pytz.timezone('Asia/Jakarta')

# Regex / Reference List
UNIVERSITIES_REF = '/content/drive/MyDrive/Project/!Playground/world-universities.csv'
MAJOR_REF = '/content/drive/MyDrive/Project/!Playground/majors-list.csv'
SKILL_REF = '/content/drive/MyDrive/Project/!Playground/skills.csv'
DEGREE_REF = ['s1','s2','s3','vokasi','bachelor','master','doctoral','sarjana','magister','doktor']
OCCUPATION_REF = '/content/drive/MyDrive/Project/!Playground/occupation.csv'
PHONE_REG = re.compile(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]')
EMAIL_REG = re.compile(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+')
WEB_REG = re.compile(r'(https?://\S+)')

# Function
def extract_text_from_pdf(pdf_path, option):
    if option == 'tika':
      return parser.from_file(pdf_path)['content']
    elif option == 'pdfminer':
      return extract_text(pdf_path)

def extract_phone_number(text):
    phone = re.findall(PHONE_REG, text)
    if phone:
        number = phone[0]
        if text.find(number) >= 0 and len(number) < 16:
            return number
    
def extract_emails(text):
    email = re.findall(EMAIL_REG, text)
    if email:
        mail = email
        return mail[0]

def extract_website(text):
    text = text = re.sub(r'[()]', '', text)
    web = re.findall(WEB_REG, text)
    if web:
        return web

def extract_college(text):
    file = UNIVERSITIES_REF
    df = pd.read_csv(file, header=None)
    universities = [i.lower() for i in df[1]]
    college_name = []
    listex = universities
    listsearch = [text]

    for i in range(len(listex)):
        for j in range(len(listsearch)): 
            if re.findall(listex[i], re.sub(' +', ' ', listsearch[j])):    
                college_name.append(listex[i])
    if college_name:    
        return college_name

def extract_major(text, univ):
    file = MAJOR_REF
    df = pd.read_csv(file, header=None)
    major = df[0].str.lower().values
    text = [x for x in text.split('\n') if x and x != ' ']
    for i in range(len(text)):
        if univ in text[i]:
            try:
                out = text[i-1]+text[i]+text[i+1]
                for row in major:
                    if row in out:
                        return row
            except:
                out = text[i-1]+text[i]
                for row in major:
                    if row in out:
                        return row

def extract_degree(text, univ):
    text = [x for x in text.split('\n') if x and x != ' ']
    for i in range(len(text)):
        if univ in text[i]:
            try:
                out = text[i-1]+text[i]+text[i+1]
                for degree in DEGREE_REF:
                    if degree in out:
                        return degree
            except:
                out = text[i-1]+text[i]
                for degree in DEGREE_REF:
                    if degree in out:
                        return degree

def extract_education(text):
    text = text[text.index('education'):]
    uni = extract_college(text)
    edu = []
    if uni:
        for item in uni:
            dct = {}
            dct['college'] = item 
            dct['major'] = extract_major(text, item)
            dct['degree'] = extract_degree(text, item)
            edu.append(dct)
        return edu

# EXPERIMENTAL take master/doctoral in same college
def extract_education2(text):
    uni = extract_college(text)
    edu = []
    deg = []
  
    for degree in DEGREE_REF:
        if degree in text:
            deg.append(degree)
 
    if uni:
        if len(uni) == 1 and len(deg) > 1:
            for i in range(len(deg)):
                dct = {}
                dct['college'] = uni[0] 
                dct['major'] = extract_major(text, uni[0])
                dct['degree'] = extract_degree(text, uni[0])
                edu.append(dct)
            if len(edu) == 3:
                edu[1]['degree'] = 'master'
                edu[2]['degree'] = 'bachelor'
            elif len(edu) == 2:
                edu[1]['degree'] = 'bachelor'
        else:
            for item in uni:
                dct = {}
                dct['college'] = item 
                dct['major'] = extract_major(text, item)
                dct['degree'] = extract_degree(text, item)
                edu.append(dct)
        return edu

def extract_skills(text):
    nlp_text = nlp(text)
    noun_chunks = nlp_text.noun_chunks
    tokens = [token.text for token in nlp_text if not token.is_stop]
    df = pd.read_csv(SKILL_REF, header=None) 
    skills = df[0].values
    skillset = []
    
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)
    
    for token in noun_chunks:
        token = token.text.lower().strip()
        if token in skills:
            skillset.append(token)
   
    skillset = list(set(skillset))
    return skillset 

def extract_estimated_work_year_experience(text):
    tex = [x for x in text.split('\n') if x and x != ' ']

    indices = []
    for i in range(len(tex)):
        line = tex[i]
        if 'dob' in line or 'birth' in line or 'born' in line:
            indices.append(i)

    tex = [i for j, i in enumerate(tex) if j not in indices]
    text = nlp(' '.join(tex))
    tokens = [token.text for token in text if not token.is_stop]
    digit = list(set([x for x in tokens if x.isdigit() or x == 'present']))

    if 'present' in digit:
        digit[digit.index('present')] = str(datetime.today().year)
  
    digit = [x for x in digit if len(x)>=4]
    year = [int(x) for x in digit]
    year = [x for x in year if x <= datetime.today().year]
  
    if year:
        return max(year) - min(year) - 4

def extract_occupation(text):
    text = ' '.join([x for x in text.split('\n') if x and x != ' '])
    df = pd.read_csv(OCCUPATION_REF, header=None)
    joblist = df[0].str.lower().values
    occupation = []

    for job in joblist:
        if (' '+ job + ' ') in text:
            occupation.append(job)
  
    if occupation:
        return occupation

def pix2np(pix):
    im = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
    im = np.ascontiguousarray(im[..., [2, 1, 0]])  
    return im

def extract_photo(pdf, folderpath):
    name = os.path.basename(pdf)
    filepath = os.path.join(folderpath, name + '.jpg')

    doc = fitz.open(pdf) 
    page = doc.load_page(0)
    pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
    img = pix2np(pix)
  
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_alt2.xml')
    faces = face_cascade.detectMultiScale(img, 1.1, 4)
  
    i = 0
    while i == 0:
        i = i + 1
        for (x, y, w, h) in faces:
            cv2.rectangle(img, (x, y), (x+w, y+h), (0, 0, 255), 2)
            faces = img[y:y + h, x:x + w]
            #cv2.imshow(faces)
            #cv2_imshow(faces)
            cv2.imwrite(filepath, faces)
            return filepath

if __name__ == '__main__':
    
    # Folder cv path
    path = '/content/drive/MyDrive/Project/!Playground/CV'

    # Folder dump face
    FACE_PATH = '/content/drive/MyDrive/Project/!Playground/Face/'

    # Iterate through all file
    cv = []
    for file in os.listdir(path):
        file_path = f"{path}/{file}"
        cv.append(file_path)

    # Input cv
    filepath = cv[0]
    text = extract_text_from_pdf(filepath, 'tika') #apache tika or pdfminer
    text = text.lower()

    # Extract information
    created = datetime.now(timezone)
    phone_number = extract_phone_number(text)
    email = extract_emails(text)
    website = extract_website(text)
    education = extract_education2(text)
    skills = extract_skills(text)
    yearworkexp = extract_estimated_work_year_experience(text)
    occupation = extract_occupation(text)
    photo = extract_photo(filepath, FACE_PATH)

    # Create JSON
    jsons = {
        'filepath': filepath,
        'created': str(created),
        'photo' : photo,
        'phone_number': phone_number,
        'email': email,
        'estimated_working_year_experience' : str(yearworkexp),
        'working_experience': occupation,
        'website' : website,
        'skills': skills,
        'education':education
        }

    print(json.dumps(jsons, indent=2))

{
  "filepath": "/content/drive/MyDrive/Project/!Playground/CV/Yudho Prakoso-resume.pdf",
  "created": "2022-09-13 21:38:38.257215+07:00",
  "photo": "/content/drive/MyDrive/Project/!Playground/Face/Yudho Prakoso-resume.pdf.jpg",
  "phone_number": "82213886517",
  "email": "yudopr10@gmail.com",
  "estimated_working_year_experience": "4",
  "working_experience": [
    "lead",
    "tutor"
  ],
  "website": [
    "https://www.linkedin.com/in/yudho-prakoso-a057a323b/",
    "https://freecodecamp.org/certification/",
    "https://doi.org/10.1145/3239283.3239297"
  ],
  "skills": [
    "data science",
    "excel",
    "sql server",
    "ui",
    "analysis",
    "pentaho",
    "python",
    "teaching",
    "c",
    "information technology",
    "postgresql",
    "project management",
    "sql",
    "mysql",
    "mathematics",
    "etl",
    "plan",
    "tableau",
    "google sheets",
    "bigquery"
  ],
  "education": [
    {
      "college": "universitas indonesia",
      "major": "mathematic