In [24]:
from PyPDF2 import PdfReader
import os
import pandas as pd

# CV files to screen: PDFs in the working directory whose name starts with
# 'CV', so that other PDFs stored alongside them (e.g. job descriptions) are
# not scored as CVs. The listing is sorted because os.listdir() returns
# entries in arbitrary order, which would otherwise make the row order of the
# results table vary between runs.
cvs_in_pdf = sorted(
    f for f in os.listdir('.') if f.startswith('CV') and f.endswith('.pdf')
)

# Keywords looked up in each CV, grouped by category.
technical_skills = ['Pandas', 'Python',
          'Scikit-Learn', 'Airflow', 'Google Big Query', 'Google Cloud Platform', 'GCP',
          'Looker', 'Node.js']
# Subset of the technical skills that is scored separately.
important_skills = ['Pandas', 'Scikit-Learn']
soft_skills = ['Problem-solving', 'Teamwork', 'Communication']
languages = ['English', 'German']

In [35]:
def match_skills(text, skills):
    """Return the skills found in ``text`` and the share of ``skills`` matched.

    Matching is a substring test of the lower-cased skill against ``text``,
    so ``text`` is expected to be lower-cased already.

    Parameters
    ----------
    text : str
        Lower-cased text extracted from a CV.
    skills : list of str
        Keywords to look for.

    Returns
    -------
    tuple (list of str, float)
        The matched skills in their original spelling and order, and the
        percentage (0-100) of ``skills`` that were matched. The percentage is
        0.0 when ``skills`` is empty.
    """
    matched = [skill for skill in skills if skill.lower() in text]
    percentage = (len(matched) / len(skills)) * 100 if skills else 0.0
    return matched, percentage


# One record per CV; the DataFrame is built once after the loop instead of
# being grown row by row.
cv_rows = []
for cv in cvs_in_pdf:
    with open(cv, 'rb') as f:
        pdf = PdfReader(f)
        # Only the first page is scanned. A PDF without pages, or a page with
        # no extractable text, is treated as empty text.
        first_page_text = pdf.pages[0].extract_text() if len(pdf.pages) else ''
        text = (first_page_text or '').lower()

    matched_imp, pct_imp = match_skills(text, important_skills)
    matched_tech, pct_tech = match_skills(text, technical_skills)
    matched_soft, pct_soft = match_skills(text, soft_skills)
    # Languages are only listed, not scored.
    matched_lang, _ = match_skills(text, languages)

    cv_rows.append({
        'cv': cv,
        '%important_skills': pct_imp,
        'matched_imp_skills': matched_imp,
        '%technical_skills': pct_tech,
        'matched_tech_skills': matched_tech,
        '%soft_skills': pct_soft,
        'matched_soft_skills': matched_soft,
        'language': matched_lang,
    })

cv_df = pd.DataFrame(
    cv_rows,
    columns=['cv', '%important_skills', 'matched_imp_skills', '%technical_skills',
             'matched_tech_skills', '%soft_skills', 'matched_soft_skills', 'language'],
)

In [36]:
# Display the per-CV skill-match table.
cv_df

Unnamed: 0,cv,%important_skills,matched_imp_skills,%technical_skills,matched_tech_skills,%soft_skills,matched_soft_skills,language
0,CV0.pdf,50.0,[Pandas],22.222222,"[Pandas, Python]",66.666667,"[Teamwork, Communication]","[English, German]"
1,CV1.pdf,0.0,[],0.0,[],33.333333,[Communication],"[English, German]"
2,CV2.pdf,0.0,[],0.0,[],66.666667,"[Teamwork, Communication]","[English, German]"
3,CV3.pdf,50.0,[Pandas],22.222222,"[Pandas, Python]",66.666667,"[Teamwork, Communication]","[English, German]"


In [37]:
# Keep only the CVs that match at least half of the important skills.
cv_df.loc[cv_df['%important_skills'].ge(50)]

Unnamed: 0,cv,%important_skills,matched_imp_skills,%technical_skills,matched_tech_skills,%soft_skills,matched_soft_skills,language
0,CV0.pdf,50.0,[Pandas],22.222222,"[Pandas, Python]",66.666667,"[Teamwork, Communication]","[English, German]"
3,CV3.pdf,50.0,[Pandas],22.222222,"[Pandas, Python]",66.666667,"[Teamwork, Communication]","[English, German]"


## Matching CVs against the whole job description (JD)

In [3]:
import os


from PyPDF2 import PdfReader

def read_pdf_text(path):
    """Return the lower-cased text of every page of the PDF at ``path``.

    Page texts are joined with a newline so that the last word of one page and
    the first word of the next are not fused into a single token. A page for
    which no text is returned contributes an empty string.
    """
    with open(path, 'rb') as file:
        pdf = PdfReader(file)
        return '\n'.join((page.extract_text() or '') for page in pdf.pages).lower()


# File names are sorted because os.listdir() returns entries in arbitrary
# order; sorting keeps the file lists (and the text lists built from them)
# in the same order on every run.
cv_list = sorted(cv for cv in os.listdir('.') if cv.endswith('.pdf') and cv.startswith('CV'))
jd_list = sorted(jd for jd in os.listdir('.') if jd.endswith('.pdf') and jd.startswith('jd'))

# Text of each job description / CV, index-aligned with jd_list / cv_list.
jd_text_list = [read_pdf_text(jd) for jd in jd_list]
cv_text_list = [read_pdf_text(cv) for cv in cv_list]


In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import PorterStemmer

# Shared resources for text cleaning: a Porter stemmer, the punctuation
# characters to discard (ASCII punctuation plus the bullet sign), and the
# English stop-word set.
ps = PorterStemmer()
punct = string.punctuation + '•'
sw = set(stopwords.words('english'))

# TODO: decide whether this function should keep accepting a list of texts
# (looping internally, as it does now) or take a single text and be called
# once per document.
def clean_text(text_list):
    """Tokenise, filter and stem every text in ``text_list``.

    For each text, the tokens that are stop words (compared case-insensitively)
    or that consist solely of punctuation characters are dropped; the remaining
    tokens are stemmed and joined back with single spaces.

    Parameters
    ----------
    text_list : list of str
        Raw texts to clean.

    Returns
    -------
    list of str
        One cleaned string per input text, in the same order.
    """
    cleaned_list = []
    for text in text_list:
        tokens = word_tokenize(text)
        kept = [
            ps.stem(word)
            for word in tokens
            if word.lower() not in sw
            # Character-wise test: a plain `word not in punct` is a substring
            # check against the punctuation string and lets multi-character
            # tokens such as '...' or '``' through.
            and not all(ch in punct for ch in word)
        ]
        cleaned_list.append(' '.join(kept))
    return cleaned_list

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# TF-IDF vectorizer used by compute_similarity(); it is re-fitted on every call.
vectorizer = TfidfVectorizer()


def compute_similarity(jd, cv):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    The module-level ``vectorizer`` is fitted on just these two documents, so
    the term weights depend only on the pair being compared.

    Parameters
    ----------
    jd : str
        Text of the job description.
    cv : str
        Text of the CV.

    Returns
    -------
    The single entry of the 1x1 cosine-similarity matrix.
    """
    tfidf = vectorizer.fit_transform([jd, cv])
    jd_vector = tfidf[0:1]
    cv_vector = tfidf[1:2]
    scores = cosine_similarity(jd_vector, cv_vector)
    return scores[0, 0]

In [9]:
# Clean the raw job-description texts and CV texts with the same routine.
cleaned_jd_list, cleaned_cv_list = (
    clean_text(texts) for texts in (jd_text_list, cv_text_list)
)

In [12]:
# Map each file name to its cleaned text (the lists are index-aligned).
jds = dict(zip(jd_list, cleaned_jd_list))

cvs = dict(zip(cv_list, cleaned_cv_list))

dict_keys(['jd1.pdf', 'jd1_alt.pdf', 'jd2.pdf', 'jd_3.pdf', 'jd_4.pdf', 'jd_5.pdf'])

In [55]:
import pandas as pd

# Score every (job description, CV) pair as a percentage. The rows are
# collected in a list and the DataFrame is built once, rather than being
# grown one row at a time.
similarity_rows = [
    {'jd': jd, 'cv': cv, 'similarity': compute_similarity(jd_t, cv_t) * 100}
    for jd, jd_t in jds.items()
    for cv, cv_t in cvs.items()
]

df = pd.DataFrame(similarity_rows, columns=['jd', 'cv', 'similarity'])

In [58]:
# Express each score as a percentage of the sum of all scores in the table
# (across every job description and CV, not per job description).
total = df['similarity'].sum()
df['relative_similarity'] = df['similarity'].div(total).mul(100)

In [59]:
# Display the similarity table with the added relative_similarity column.
df

Unnamed: 0,jd,cv,similarity,relative_similarity
0,jd1.pdf,CV0.pdf,15.601686,4.923146
1,jd1.pdf,CV1.pdf,7.247228,2.286878
2,jd1.pdf,CV2.pdf,6.561848,2.070605
3,jd1.pdf,CV3.pdf,19.94993,6.295244
4,jd1.pdf,CV4.pdf,27.469837,8.668167
5,jd1_alt.pdf,CV0.pdf,15.784382,4.980796
6,jd1_alt.pdf,CV1.pdf,6.900439,2.177449
7,jd1_alt.pdf,CV2.pdf,6.66228,2.102297
8,jd1_alt.pdf,CV3.pdf,20.250566,6.39011
9,jd1_alt.pdf,CV4.pdf,27.266766,8.604087


For now, I simply calculate the text similarity between the CV and 