In [None]:
import re
from nltk.corpus import stopwords
from collections import Counter 
import pandas as pd
import PyPDF2
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from scipy.spatial.distance import cosine
from random import randint
import numpy as np

# Data science skill vocabularies, grouped by category. Every entry is a
# single lowercase token.
# NOTE(review): the original comment said these lists were "modified from"
# an external source, but the reference itself is missing from this file.
program_languages = ['bash','r','python','java','c++','ruby','perl','matlab','javascript','scala','php']
analysis_software = ['excel','tableau','sas','spss','d3','saas','pandas','numpy','scipy','sps','spotfire','scikit','splunk','power','h2o']
ml_framework = ['pytorch','tensorflow','caffe','caffe2','cntk','mxnet','paddle','keras','bigdl']
bigdata_tool = ['hadoop','mapreduce','spark','pig','hive','shark','oozie','zookeeper','flume','mahout','etl']
ml_platform = ['aws','azure','google','ibm']
methodology = ['agile','devops','scrum']
databases = ['sql','nosql','hbase','cassandra','mongodb','mysql','mssql','postgresql','oracle','rdbms','bigquery']
# Flat vocabulary of every skill keyword (all categories above concatenated).
overall_skills_dict = (
    program_languages
    + analysis_software
    + ml_framework
    + bigdata_tool
    + databases
    + ml_platform
    + methodology
)
# Education-level keywords, kept separate from the skill keywords.
education = ['master','phd','undergraduate','bachelor','mba']
# Skills plus education keywords.
overall_dict = overall_skills_dict + education

# Length of each minhash vector.
N = 32
# Inclusive upper bound for the randomly drawn hash coefficients.
max_val = (2**8)-1
# N random (a, b) coefficient pairs; each pair defines one hash function
# (a * x + b) % prime that is applied to every input set.
# The multiplier `a` is drawn from 1..max_val: with a == 0 the function would
# return the constant b for every element, so that slot of the minhash vector
# would be identical for all sets and carry no information.
perms = [(randint(1, max_val), randint(0, max_val)) for _ in range(N)]
# Sample minhash vector of length N, initialised to +infinity.
# Each record is represented by its own vector of this shape.
vec = [float('inf')] * N


class skill_keyword_match:
    '''
    Match resume skill keywords against job posts.

    Job posts are held in ``jobs_info_df``. Skill keywords are looked up in
    the module-level vocabularies (``overall_dict``, ``overall_skills_dict``,
    ``education``); minhash signatures use the module-level ``N`` and ``perms``.
    '''
    # Class-level default; every instance gets its own DataFrame in __init__.
    jobs_info_df = None
    # Lazily loaded English stop-word set, shared by all instances.
    _stop_words = None

    def __init__(self, jobs_list):
        '''
        Initialization - converts list to DataFrame
        Input: 
            jobs_list (list): a list of all jobs info
        Output: 
            None
        '''
        self.jobs_info_df = pd.DataFrame(jobs_list)
        # The columns keep the labels pandas assigns. An earlier rename() call
        # here discarded its result (and used string keys against integer
        # labels), so it never had any effect; extract_jobs_keywords relies on
        # the job description being reachable as column 3.
        # NOTE(review): that rename suggested the layout
        # 0=job_title, 1=company_name, 2=location, 3=job_desc - confirm.
        print("Dataset:\n")
        print(self.jobs_info_df.head(5))

    @classmethod
    def _english_stop_words(cls):
        '''
        Return the English stop-word set, loading it only once
        Input: 
            None
        Output: 
            stop words (frozenset)
        '''
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words("english"))
        return cls._stop_words

    def keywords_extract(self, text):
        '''
        Tokenize webpage text and extract keywords
        Input: 
            text (str): text to extract keywords from
        Output: 
            keywords (list): unique keywords found in the pre-defined
                dictionary (order is not defined)
        '''
        # Keep letters plus '+' and '3' so tokens like 'c++' and 'd3' survive;
        # every other character becomes a separator.
        # NOTE(review): all other digits are stripped, so the vocabulary
        # entries 'h2o' and 'caffe2' can never be matched by this tokenizer.
        text = re.sub("[^a-zA-Z+3]", " ", text)
        stops = self._english_stop_words()
        # Drop English stop words and de-duplicate the remaining tokens.
        tokens = {w for w in text.lower().split() if w not in stops}
        # Set membership is O(1); the vocabulary itself is a list.
        vocabulary = set(overall_dict)
        return [str(word) for word in tokens if word in vocabulary]

    def keywords_count(self, keywords, counter):
        '''
        Count frequency of keywords
        Input: 
            keywords (list): list of keywords
            counter (Counter): keyword -> number of occurrences
        Output: 
            keyword_count (DataFrame index:keyword value:count), one row per
                distinct keyword in first-seen order, column 'Freq'
        '''
        # A repeated keyword still yields a single row.
        unique_keywords = list(dict.fromkeys(keywords))
        # Build the frame in one step so 'Freq' gets a numeric dtype; growing
        # an empty frame row by row leaves an object column, which cannot be
        # plotted.
        frequencies = [counter[each_word] for each_word in unique_keywords]
        return pd.DataFrame({'Freq': frequencies}, index=unique_keywords)

    def exploratory_data_analysis(self):
        '''
        Exploratory data analysis: bar plot of the most requested skills
        Requires the 'keywords' column created by extract_jobs_keywords.
        Input: 
            None
        Output: 
            None
        '''
        num_jobs = self.jobs_info_df.shape[0]
        # Each job contributes a keyword at most once (keywords are unique per
        # job), so these are document frequencies.
        doc_freq = Counter()
        for job_keywords in self.jobs_info_df['keywords']:
            doc_freq.update(job_keywords)

        # Pre-defined skillset vocabulary in Counter
        overall_skills_df = self.keywords_count(overall_skills_dict, doc_freq)
        # Calculate percentage of required skills in all jobs
        overall_skills_df['Freq_perc'] = overall_skills_df['Freq'] * 100 / num_jobs
        overall_skills_df = overall_skills_df.sort_values(by='Freq_perc', ascending=False)
        # Make bar plot of the 30 most frequent skills
        plt.figure(figsize=(14,8))
        overall_skills_df['Freq_perc'].head(30).plot.bar()
        plt.title('Percentage of Required Data Skills in Data Scientist Job Posts')
        plt.ylabel('Percentage Required in Jobs (%)')
        plt.xticks(rotation=30)
        plt.show()

        # Education requirements
        # NOTE(review): this table is computed but neither plotted nor
        # returned - confirm whether it should be displayed.
        education_df = self.keywords_count(education, doc_freq)
        # Merge undergrad with bachelor
        education_df.loc['bachelor','Freq'] = (
            education_df.loc['bachelor','Freq'] + education_df.loc['undergraduate','Freq']
        )
        education_df = education_df.drop(labels='undergraduate', axis=0)
        # Calculate percentage of required education in all jobs
        education_df['Freq_perc'] = education_df['Freq'] * 100 / num_jobs
        education_df = education_df.sort_values(by='Freq_perc', ascending=False)

    def get_jaccard_sim(self, x_set, y_set):
        '''
        Jaccard similarity or intersection over union measures similarity 
        between finite sample sets, and is defined as size of intersection 
        divided by size of union of two sets. 
        
        Input: 
            x_set (set)
            y_set (set)
        Output: 
            Jaccard similarity score (float in [0, 1]); 0.0 when both sets
                are empty
        '''
        intersection_size = len(x_set.intersection(y_set))
        union_size = len(x_set) + len(y_set) - intersection_size
        # Two empty sets have an empty union; report "no similarity" instead
        # of dividing by zero.
        if union_size == 0:
            return 0.0
        return float(intersection_size) / union_size

    def cal_similarity(self, resume_keywords, location=None):
        '''
        Calculate similarity between keywords from resume and job posts
        Adds/overwrites the 'similarity' column of jobs_info_df.
        Requires the 'keywords' column created by extract_jobs_keywords.
        Input: 
            resume_keywords (list): resume keywords
            location (str): city to search jobs (currently ignored; no
                location filtering is applied)
        Output: 
            top_match (DataFrame): top job matches (at most 20 rows)
        '''
        num_jobs_return = 20
        j_info = self.jobs_info_df
        resume_set = set(resume_keywords)
        j_info['similarity'] = [
            self.get_jaccard_sim(resume_set, set(job_skills))
            for job_skills in j_info['keywords']
        ]
        # head() already caps at the number of available rows.
        top_match = j_info.sort_values(by='similarity', ascending=False).head(num_jobs_return)
        # Return top matched jobs
        return top_match

    def extract_jobs_keywords(self):
        '''
        Extract skill keywords from job descriptions and add a new column 
        'keywords' to jobs_info_df. Job descriptions are read from column 3.
        Input: 
            None
        Output: 
            None
        
        '''
        self.jobs_info_df['keywords'] = [
            self.keywords_extract(job_desc) for job_desc in self.jobs_info_df[3]
        ]

    def _read_pdf_pages(self, resume_pdf):
        '''
        Read the text of every page of a PDF file
        Input: 
            resume_pdf (str): path to PDF file
        Output: 
            pages (list): extracted text, one entry per page
        '''
        # The context manager closes the file even if parsing fails; the text
        # is extracted inside it because the reader pulls from the open file.
        with open(resume_pdf, 'rb') as resume_file:
            if hasattr(PyPDF2, 'PdfReader'):
                # Current PyPDF2 API (the legacy names below were removed in
                # PyPDF2 3.0).
                resume_reader = PyPDF2.PdfReader(resume_file)
                return [page.extract_text() for page in resume_reader.pages]
            # Legacy PyPDF2 API
            resume_reader = PyPDF2.PdfFileReader(resume_file)
            return [resume_reader.getPage(x).extractText() for x in range(resume_reader.numPages)]

    def extract_resume_keywords(self, resume_pdf):
        '''
        Extract key skills from a resume 
        Input: 
            resume_pdf (str): path to resume PDF file
        Output: 
            resume_skills (DataFrame index:keyword value:count): keywords counts,
                restricted to skills that occur at least once. The count is
                the number of pages on which the keyword appears.
        '''
        # Reading each page in PDF
        resume_content = self._read_pdf_pages(resume_pdf)
        # Extract key skills from each page and count them
        resume_freq = Counter()
        for page in resume_content:
            resume_freq.update(self.keywords_extract(page))
        # Resume skill keywords counts
        resume_skills = self.keywords_count(overall_skills_dict, resume_freq)

        return resume_skills[resume_skills['Freq'] > 0]

    def calculate_minhash(self, resume_keywords):
        '''
        Calculate minhash-based similarity between resume and job keywords
        Adds/overwrites the 'Minhash_Similarity' column of jobs_info_df and
        prints every score. Requires the 'keywords' column created by
        extract_jobs_keywords.
        Input: 
            resume_keywords (list): resume keywords
        Output: 
            top_match_minhash (DataFrame): top job matches ranked by
                'Minhash_Similarity' (at most 10 rows)
        '''
        num_jobs_return = 10
        j_info = self.jobs_info_df
        minhash_similarity = []

        # The resume signature does not depend on the job, so compute it once.
        data_resume = set(resume_keywords)
        resume_vec = None
        if data_resume:
            raw_resume = self.minhash(data_resume)
            # Dividing by the max value scales the vector into {0:1}
            resume_vec = np.array(raw_resume) / max(raw_resume)

        for job_skills in j_info['keywords']:
            data_job_keywords = set(job_skills)
            if resume_vec is None or not data_job_keywords:
                # An empty set has an all-infinite signature, which would turn
                # the score into NaN; treat it as "no similarity".
                cos_sim = 0.0
            else:
                raw_job = self.minhash(data_job_keywords)
                job_vec = np.array(raw_job) / max(raw_job)
                # Measuring the similarity between the vectors using cosine similarity
                cos_sim = 1 - cosine(resume_vec, job_vec)
            minhash_similarity.append(cos_sim)
            print( '\n Minhash using similarity:', cos_sim )

        j_info['Minhash_Similarity'] = minhash_similarity
        # Rank by the minhash score computed above (not by the Jaccard
        # 'similarity' column, which may not even exist yet).
        top_match_minhash = j_info.sort_values(by='Minhash_Similarity', ascending=False).head(num_jobs_return)
        # Return top matched jobs
        return top_match_minhash

    def minhash(self, s, prime=4294967311):
        '''
        Compute the minhash vector of a set
        For each (a, b) pair in the module-level ``perms`` the vector holds
        the minimum of (a * value + b) % prime over all elements.
        Input: 
            s (set): elements to hash; non-int elements go through hash(),
                which Python salts per process for strings, so vectors are
                only comparable within one run
            prime (int): modulus of the hash functions
        Output: 
            signature (list): N values; all float('inf') for an empty set
        '''
        signature = [float('inf')] * N

        for val in s:
            if not isinstance(val, int):
                val = hash(val)
            for perm_idx, (a, b) in enumerate(perms):
                output = (a * val + b) % prime
                if output < signature[perm_idx]:
                    signature[perm_idx] = output
        return signature