In [1]:
from gensim.models import KeyedVectors

print('loading model...')
#loading the downloaded model
model =  KeyedVectors.load_word2vec_format('SO_vectors_200.bin', binary=True)
print('model loaded successfully')
#the model is loaded. It can be used to perform all of the tasks mentioned above.


loading model...
model loaded successfully


In [3]:
import numpy as np


class DocSim:
    def __init__(self, w2v_model, stopwords=None):
        self.w2v_model = w2v_model
        self.stopwords = stopwords if stopwords is not None else []

    def vectorize(self, doc: str) -> np.ndarray:
        """
        Identify the vector values for each word in the given document
        :param doc:
        :return:
        """
        doc = doc.lower()
        words = [w for w in doc.split(" ") if w not in self.stopwords]
        word_vecs = []
        for word in words:
            try:
                vec = self.w2v_model[word]
                word_vecs.append(vec)
            except KeyError:
                # Ignore, if the word doesn't exist in the vocabulary
                pass

        # Assuming that document vector is the mean of all the word vectors
        # PS: There are other & better ways to do it.
        vector = np.mean(word_vecs, axis=0)
        return vector

    def _cosine_sim(self, vecA, vecB):
        """Find the cosine similarity distance between two vectors."""
        csim = np.dot(vecA, vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))

        if np.isnan(np.sum(csim)):
            return 0
        return csim

    def calculate_similarity(self, dircvs, source_doc, target_docs=None, threshold=0):
        """Calculates & returns similarity scores between given source document & all
        the target documents."""
        if not target_docs:
            return []

        #Vectorize using Word2Vec
        source_vec = self.vectorize(source_doc)
        results = []
        for i in range(len(target_docs)):
            #Vectorize using Word2Vec
            target_vec = self.vectorize(target_docs[i])
            #calculating similarity using cosine similarity
            sim_score = self._cosine_sim(source_vec, target_vec)
            if sim_score > threshold:
                results.append({"score": sim_score, "doc": dircvs[i]})
            # Sort results by score in desc order
            results.sort(key=lambda k: k["score"], reverse=True)

        return results
        # dircvs[target_docs.index(doc)]

In [3]:
from gensim.models.keyedvectors import KeyedVectors
# from DocSim import DocSim
import textract
from os import listdir
from os.path import isfile, join

# ds = DocSim(model)
dir_cvs = './CVS'


ds = DocSim(model)

target_docs = []

def read_All_CV(filename):
    text = textract.process(filename)
    return text.decode('utf-8')

dircvs = [join(dir_cvs, f) for f in listdir(dir_cvs) if isfile(join(dir_cvs, f))]

for cv in dircvs:
    text = read_All_CV(cv)
    target_docs.append(text)

source_doc = textract.process('source.docx')
source_doc = source_doc.decode('utf-8')

print(source_doc)

sim_scores = ds.calculate_similarity(dircvs, source_doc, target_docs)

print(sim_scores)


Bsc degree of Computer Science or equivalent. 

		Familiarity with vulnerability assessment and penetration best practices 

		Experience with vulnerability and penetration testing techniques and tools 

		3 or more years of hands-on penetration testing experience 

		3 or more years of hands-on red team testing experience 

		One of the following certificates at least (CREST, OSCP , GPEN , GIAC) 

		Competency in common operating systems (e.g. Windows, macOS, Linux) 

		Proficiency with at least two scripting languages (e.g. Python, Bash, JavaScript, PowerShell)
[{'score': 0.8357759, 'doc': './CVS/penetration.docx'}, {'score': 0.81228137, 'doc': './CVS/Abdelrahman Eid CV 104.pdf'}, {'score': 0.73832136, 'doc': './CVS/Resume2019.pdf'}, {'score': 0.73459524, 'doc': './CVS/Ahmed-Ahmed.pdf'}, {'score': 0.7204977, 'doc': './CVS/PHPDeveloper-CV-Gehad-Al-Shepeny.pdf'}, {'score': 0.7137368, 'doc': './CVS/mean_stack.docx'}, {'score': 0.6440077, 'doc': './CVS/Abdullah Mustafa Zakaria Nassar Res

In [20]:
from tkinter.filedialog import askopenfilenames, askopenfilename
import tkinter as tk
import os
import textract
from gensim.models.keyedvectors import KeyedVectors


def getJobDescription(filename):
    with open(filename) as f:
        lines = f.readlines()
    lines=''.join(lines)
    print(lines)
    entry.delete(0.0,"end")
    entry.insert(0.0,lines)

def openFileSelector():
    filename = askopenfilename()
    getJobDescription(filename)

def writeSelectedCVs():
    global CVs
    str="Selected CVs\n"
    for cv in CVs:
        str+=os.path.basename(cv)+"\n"
    label=tk.Label(root,text=str)
    canvas.create_window(1000,150, window=label)
ds = DocSim(model)

def getCVs():
    global CVs
    CVs = list(askopenfilenames())
    print(CVs)
    writeSelectedCVs()

def read_All_CV(filename):
    text = textract.process(filename)
    return text.decode('utf-8')
def sortCVs():
    global CVs
    jobDesc = entry.get(0.0,"end")
    targetDocs = []
    for cv in CVs:
        targetDocs.append(read_All_CV(cv))
    sim_scores = ds.calculate_similarity(CVs, jobDesc, targetDocs)
    # print('sim = ', sim_scores)
    str = "Sorted CVs\n"
    for cv in sim_scores:
        str += os.path.basename(cv['doc']) + ", " + repr(cv['score']) + "\n"
        
    label = tk.Label(root, text=str)
    canvas.create_window(1300, 150, window=label)


CVs = []
root = tk.Tk()

canvas = tk.Canvas(root, width=1500, height=1000)
canvas.pack()

entry = tk.Text(root, height=30, width=70)

canvas.create_window(300, 260, window=entry)

button1 = tk.Button(text='Get Job Description from file', command=openFileSelector)
button2 = tk.Button(text='Get CVs', command=getCVs)
button3 = tk.Button(text='Sort CVs', command=sortCVs)
x = 700
canvas.create_window(x, 100, window=button1)
canvas.create_window(x, 200, window=button2)
canvas.create_window(x, 300, window=button3)

root.mainloop()

Bsc degree of Computer Science or equivalent.
Familiarity with vulnerability assessment and penetration best practices
Experience with vulnerability and penetration testing techniques and tools
3 or more years of hands-on penetration testing experience
3 or more years of hands-on red team testing experience
One of the following certificates at least (CREST, OSCP , GPEN , GIAC)
Competency in common operating systems (e.g. Windows, macOS, Linux)
Proficiency with at least two scripting languages (e.g. Python, Bash, JavaScript, PowerShell)
['/home/saad/Documents/fourth-year/first-term/graduation-project/model/CVS/Abdelrahman Eid CV 104.pdf', '/home/saad/Documents/fourth-year/first-term/graduation-project/model/CVS/Abdullah Mustafa Zakaria Nassar Resume.pdf', '/home/saad/Documents/fourth-year/first-term/graduation-project/model/CVS/Ahmed-Ahmed.pdf', '/home/saad/Documents/fourth-year/first-term/graduation-project/model/CVS/CV-PHP-Developer.pdf', '/home/saad/Documents/fourth-year/first-term/g