* Inference using TF-IDF weighted vectors

* Matching & document similarity calculation using the TS-SS method (details are in the research paper attached to this repo)


In [None]:
from __future__ import unicode_literals
from gensim.summarization import keywords
import pandas as pd
import math
import nltk
import string
import matplotlib.pyplot as plt
from nltk.stem.porter import PorterStemmer
import unicodedata
import nltk
nltk.download('stopwords')
import gensim
import logging

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the job postings and keep only the columns used below; rows with a
# missing value in any of these columns are dropped.
df = pd.read_csv('data.csv')
selected_df = (
    df[['job_description', 'company_name', 'job_title', 'category']]
    .dropna()
    .copy()
)

# Text used for matching = description followed by category. The explicit
# space keeps the last word of the description and the first word of the
# category from fusing into one token; both operands come from the filtered
# frame so the result does not depend on index alignment with `df`.
selected_df['new_job_description'] = (
    selected_df['job_description'] + ' ' + selected_df['category']
)

# Parallel lists: entry k of each list describes the same posting.
jds = selected_df['new_job_description'].tolist()
companies = selected_df['company_name'].tolist()
positions = selected_df['job_title'].tolist()  # job title of each posting


In [None]:
class MyCountVectorizer:
    '''Bag-of-words vectorizer built on plain Python lists.

    After construction:
      * ``self.corpus``   - the normalised documents (list of str)
      * ``self.features`` - sorted vocabulary (list of str, stop words removed)
      * ``self.matrix``   - one row per document, one column per feature,
                            holding raw term counts
    '''

    def __init__(self, docs):
        '''Normalise ``docs`` (an iterable of strings), then build the
        vocabulary and the document-term matrix.'''
        self.corpus = self.normalize_corpus(docs)
        self.make_features()
        self.make_matrix()

    def normalize_corpus(self, docs):
        '''Return a list with one cleaned string per document.

        Cleaning drops every character that is not in ``string.printable``,
        replaces each punctuation character with a space and lower-cases the
        result.
        '''
        table = str.maketrans(string.punctuation,
                              len(string.punctuation) * ' ')
        printable = set(string.printable)
        norm_docs = []
        for doc_raw in docs:
            # In Python 3 ``filter`` returns a lazy iterator, so ``str(filter(...))``
            # would yield "<filter object at 0x...>" instead of the text;
            # join the kept characters explicitly.
            doc = ''.join(ch for ch in doc_raw if ch in printable)
            norm_docs.append(doc.translate(table).lower())
        return norm_docs

    def make_features(self):
        ''' create vocabulary set from the corpus '''
        # A set gives constant-time membership tests inside the loop.
        stopwords = set(nltk.corpus.stopwords.words('english'))
        vocabulary = set()
        for doc in self.corpus:
            for word in doc.split():
                if word not in stopwords:
                    vocabulary.add(word)
        self.features = sorted(vocabulary)

    def _term_counts(self, document):
        '''Return a ``{word: occurrences}`` dict for one normalised document.'''
        counts = {}
        for word in document.split():
            counts[word] = counts.get(word, 0) + 1
        return counts

    def make_matrix(self):
        '''Fill ``self.matrix`` with the raw count of every feature in every
        document (rows follow ``self.corpus``, columns follow
        ``self.features``).'''
        self.matrix = []
        for doc in self.corpus:
            # Count each document once instead of rescanning it per feature.
            counts = self._term_counts(doc)
            self.matrix.append([counts.get(word, 0) for word in self.features])

    def term_freq(self, term, document):
        '''Return how many whitespace-separated tokens of ``document`` equal
        ``term``.'''
        return sum(1 for word in document.split() if word == term)

    def print_matrix(self):
        '''Print the matrix, one row (document vector) per line.'''
        for vec in self.matrix:
            print(vec)

    def get_matrix(self):
        '''Return the document-term matrix (list of lists).'''
        return self.matrix

    def get_features(self):
        '''Return the sorted vocabulary list.'''
        return self.features

    def get_density(self):
        ''' get the density (# of non-zero elements / # all elements )

        Returns 0.0 for a matrix without any elements.
        '''
        counter = 0
        total = 0
        for row in self.matrix:
            for item in row:
                if item != 0:
                    counter += 1
                total += 1
        if total == 0:
            return 0.0
        return 1.0 * counter / total

In [None]:
class MyTfIdfVectorizer(MyCountVectorizer):
    '''TF-IDF variant of MyCountVectorizer.

    Each matrix cell holds ``tf * idf`` for one (document, feature) pair and
    every row is then divided by its own sum, so a non-empty row sums to 1.
    '''

    def _idf_from_doc_count(self, doc_count):
        '''Return ``log(N / doc_count)`` with N = number of documents.

        A term that occurs in no document gets weight 0.0 instead of
        triggering a division by zero.
        '''
        if doc_count == 0:
            return 0.0
        return math.log(1.0 * len(self.corpus) / doc_count)

    def make_matrix(self):
        'overriding method: fill self.matrix with row-normalised tf-idf weights'
        tokenized = [doc.split() for doc in self.corpus]

        # Document frequency of every token, computed in a single pass so the
        # idf of a feature is not recomputed for each document.
        doc_freq = {}
        for words in tokenized:
            for word in set(words):
                doc_freq[word] = doc_freq.get(word, 0) + 1
        idf = [self._idf_from_doc_count(doc_freq.get(word, 0))
               for word in self.features]

        self.matrix = []
        for words in tokenized:
            counts = {}
            for word in words:
                counts[word] = counts.get(word, 0) + 1
            doc_vec = [counts.get(word, 0) * weight
                       for word, weight in zip(self.features, idf)]
            total = sum(doc_vec)
            # Weights are never negative, so a zero sum means an all-zero row
            # (e.g. a document made only of stop words); leave it as zeros
            # rather than dividing by zero.
            if total:
                doc_vec = [value / total for value in doc_vec]
            self.matrix.append(doc_vec)

    def inverse_document_freq(self, term):
        '''Return the inverse document frequency of ``term`` over the corpus:
        ``log(number of documents / number of documents containing term)``,
        or 0.0 when no document contains the term.'''
        doc_count = 0
        for document in self.corpus:
            if self.term_freq(term, document) > 0:
                doc_count += 1
        return self._idf_from_doc_count(doc_count)


In [None]:
# Read the resume text and store it as the LAST entry of `jds`; the cells
# below treat jds[-1] as the resume and jds[:-1] as the job descriptions.
resume_path = 'resume.txt'
with open(resume_path, 'r') as f:
    resume = f.read()

# `jds` and `companies` are built together with equal lengths, so a longer
# `jds` means the resume was already appended by an earlier run of this cell.
# Replace it instead of appending a second copy, keeping the cell re-runnable.
if len(jds) > len(companies):
    jds[-1] = resume
else:
    jds.append(resume)

In [None]:
# Vectorise every entry of `jds`: the constructor normalises the documents,
# builds the vocabulary and fills the TF-IDF matrix (one row per entry of `jds`).
myvec = MyTfIdfVectorizer(jds)

In [None]:
# Show INFO-level log records as "<timestamp> : <level> : <message>".
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)

In [None]:
# Load the pre-trained GoogleNews word2vec vectors (binary word2vec format)
# straight from the URL below.
word2vec_source = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_source, binary=True)

2020-10-01 15:13:18,114 : INFO : loading projection weights from https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2020-10-01 15:15:08,497 : INFO : loaded (3000000, 300) matrix from https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz


In [None]:
# English stop-word list from the NLTK 'stopwords' corpus.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# Build, for every document in `jds`, a list of weighted word2vec vectors.
#   * words listed in `imp` get the fixed boost weight
#   * stop words and single characters are skipped
#   * every other word is weighted by its TF-IDF value for that document
imp = ['java']  # terms that always receive the fixed boost weight
p = string.punctuation
d = string.digits
table_p = str.maketrans(p, len(p) * " ")
table_d = str.maketrans(d, len(d) * " ")

boost_weight = 2
stopword_set = set(stopwords)
important_terms = {term.lower() for term in imp}
# feature -> column index, built once instead of a linear .index() per word.
feature_index = {term: col for col, term in enumerate(myvec.get_features())}
tfidf_rows = myvec.get_matrix()

vec = []
# Row i of the TF-IDF matrix belongs to jds[i]; enumerate keeps the row index
# in step with the document (a counter reset inside the loop would always
# read row 0).
for i, jd in enumerate(jds):
    tokens = jd.translate(table_p).translate(table_d).split()
    jd_vector = []
    for word in tokens:
        # The vocabulary is lower-cased, so compare on the lower-cased token;
        # the embedding lookup below keeps the original spelling.
        token = word.lower()
        if token in important_terms:
            weight = boost_weight
        elif token in stopword_set or len(word) <= 1:
            # NOTE(review): previously these tokens shared the x2 branch with
            # `imp`; they are excluded from the vocabulary, so they are
            # skipped here - confirm this matches the intended weighting.
            continue
        else:
            col = feature_index.get(token)
            if col is None:
                continue  # token is not part of the TF-IDF vocabulary
            weight = tfidf_rows[i][col]
        try:
            embedding = model[word]
        except KeyError:
            continue  # word has no embedding in the model
        jd_vector.append([a * weight for a in embedding])
    vec.append(jd_vector)

In [None]:
# Collapse each document's list of weighted word vectors into one vector by
# averaging component-wise over the words of that document.
embedding_dim = 300  # length of the vectors stored in `vec`
mean_vec = []
for j in vec:
    if j:
        # Divide by the number of word vectors in the document (len(j)),
        # not by the length of a single vector.
        mean = [1.0 * sum(component) / len(j) for component in zip(*j)]
    else:
        # A document without any usable word gets an all-zero vector.
        mean = [0.0] * embedding_dim
    mean_vec.append(mean)
data = mean_vec

In [None]:
from scipy.spatial import distance

# Cosine distance between every job-description vector (data[:-1]) and the
# resume vector (data[-1]). The loop variable has its own name so the global
# `vec` list built earlier is not overwritten when this cell runs.
resume_vec = data[-1]
cos_dist = [float(distance.cosine(jd_vec, resume_vec)) for jd_vec in data[:-1]]

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [None]:
# For every job description (all entries of `jds` except the last), extract
# keywords, stem them, de-duplicate the stems and store them as one string in
# which each stem is followed by a single space.
ps = PorterStemmer()
key_list = []

for jd in jds[:-1]:
    stems = set()
    for term in keywords(jd).split('\n'):
        stems.add(ps.stem(term))
    key_list.append(''.join('{} '.format(stem) for stem in stems))

In [None]:
# One row per job posting with its cosine distance to the resume.
summary = pd.DataFrame({
        'Company': companies,
        'Postition': positions,  # (sic) column name kept unchanged
        'Cosine Distances': cos_dist,
        #'Keywords': key_list,
        'Job Description': jds[:-1]
    })
# Cosine distance is 1 - cosine similarity, so the best matches have the
# SMALLEST distance: sort ascending to put them first.
z = summary.sort_values(by='Cosine Distances', ascending=True)
#z.to_csv('Summary.csv',encoding="utf-8")

In [None]:
# Preview the first rows of the sorted summary table.
z.head()

Unnamed: 0,Company,Postition,Cosine Distances,Job Description
1596,Private Advertiser,Owner Drivers,0.883422,Metrans Late-model white 1&2T Vans & Trays 4-1...
1615,CKC Cabinets pty ltd,Cabinet maker,0.740032,We are seeking open minded people who are inte...
2399,Private Advertiser,Carpenter/Concreter OFFSIDER,0.738509,POSITION VACANT Carpenter/Concreter OFFSIDER S...
2948,Private Advertiser,Security Guards,0.70705,Full Time and Casual Work. Must be available f...
1261,Hukarere Girls' College,Weekend Activities Supervisor,0.706251,Hukarere Girls' College Hostel Weekend Activit...
