### 8. Load in tfidf vectorizer and SGD regressor models
### 9. Query input text
* Input:  
The main function `query_predict` takes a text string as input. Including the job title is not mandatory, but it does improve prediction accuracy. You can also specify the career cluster to get more accurate results.

* Pipeline:  
The query text goes through a pipeline of TF-IDF vectorization, score prediction by the regression models, and a search for the nearest jobs in the training data based on skill scores. The scores returned in the final results are normalized to percentiles; the scores used for calculating distances between jobs, however, are not normalized.


* Output:  
The output contains the top 10 skills and the top 10 knowledge areas required for the query job. It also recommends the jobs most similar to the query job. To protect proprietary information, only generic job titles from O*NET are displayed.

## 0. Load in Library

In [1]:
import re
import pandas as pd
import string
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text 
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn import linear_model
from sklearn.externals import joblib
from sklearn.metrics.pairwise import pairwise_distances
from collections import defaultdict
from sklearn.neighbors import NearestNeighbors
from scipy.stats import percentileofscore
from sklearn.metrics import pairwise
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import json
import io

* Load in stopwords list, and predictor names

In [2]:
def readJson(filename):
    """Open *filename* for reading and return its decoded JSON content."""
    with open(filename, 'r') as source:
        return json.load(source)

# predictors.json must provide at least the 'Knowledge' and 'Skills' keys,
# both of which are read here; all_predictors is Knowledge followed by Skills.
predictors = readJson('predictors.json')
all_predictors=predictors['Knowledge']+predictors['Skills']

# Stop words are held as a set.
# NOTE(review): my_stop_words is not referenced by any code visible in this
# part of the notebook - confirm it is still needed.
my_stop_words = set(readJson('stopwords.json'))


## 8. Load in Models and Matrix for Distance Calculation 

In [106]:
# Persisted vectorizer, restored with joblib.
tfidf_vect = joblib.load("score_model_2/tfidf.vect")

# One persisted model per predictor name, stored under that name.
best_estimator = {}
for predictor in all_predictors:
    best_estimator[predictor] = joblib.load("score_model_2/%s.sgd" % predictor)

In [107]:
# Score tables exported from the training step.
# listing_train_for_query_df is the reference table for percentile
# normalization and for the nearest-job search; the code below reads its
# predictor columns plus 'Occupation' and 'Career Cluster'.
# NOTE(review): listing_train_normalized_score_df is not referenced by any
# code visible in this part of the notebook - confirm it is still needed.
listing_train_normalized_score_df = pd.read_csv('score_model_2/listing_train_normalized_score.csv')
listing_train_for_query_df = pd.read_csv('score_model_2/listing_train_for_query.csv')

In [148]:
# Show the distinct 'Career Cluster' values available for filtering
# (the printed result below also contains NaN).
listing_train_for_query_df['Career Cluster'].unique()

array(['Government & Public Administration', 'Architecture & Construction',
       'Finance', 'Marketing', 'Hospitality & Tourism', 'Manufacturing',
       'Human Services', 'Science, Technology, Engineering & Mathematics',
       'Law, Public Safety, Corrections & Security', 'Health Science',
       'Information Technology', 'Education & Training',
       'Business Management & Administration',
       'Transportation, Distribution & Logistics',
       'Arts, Audio/Video Technology & Communications',
       'Agriculture, Food & Natural Resources', nan], dtype=object)

* `my_prediction` class inputs: query DataFrame, TF-IDF vectorizer, SGD models

In [108]:
class my_prediction:
    """Clean and vectorize query text, then predict one score per model.

    Parameters:
        query_df: pandas DataFrame with a 'text' column of strings.
        tfidf_vect: object whose ``transform`` turns the cleaned text into
            a feature matrix.
        models: mapping of predictor name -> object whose ``predict``
            accepts that feature matrix.

    Attributes set on construction:
        query_df: private copy of the input frame with an added
            'clean_text' column.
        query_matrix: result of ``tfidf_vect.transform`` on 'clean_text'.
    """

    def __init__(self,query_df,tfidf_vect,models):
        # Copy first: clean_text() adds a column, and the caller's frame
        # must not be modified as a side effect of building a prediction.
        self.query_df = query_df.copy()
        self.tfidf_vect = tfidf_vect
        self.models = models
        self.clean_text()
        self.vectorizer()

    def clean_text(self):
        """Store a lower-cased copy of 'text' in 'clean_text'.

        Every whitespace character (tab, newline, ...) is replaced by one
        space; runs of whitespace are not collapsed.
        """
        self.query_df['clean_text'] = self.query_df['text'].apply(
            lambda x: re.sub(r'\s', ' ', x).lower())

    def vectorizer(self):
        """Transform 'clean_text' into ``self.query_matrix``; return self."""
        self.query_matrix = self.tfidf_vect.transform(self.query_df['clean_text'])
        return self

    def predict_score(self):
        """Return a copy of ``query_df`` with one added column per predictor.

        When the feature matrix has no non-zero entry at all, the models are
        not called and every predictor column is set to 0. The result is also
        kept on ``self.score_df``.
        """
        self.score_df = self.query_df.copy()
        has_features = len(self.query_matrix.nonzero()[1]) > 0
        for predictor in self.models:
            if has_features:
                self.score_df[predictor] = self.models[predictor].predict(self.query_matrix)
            else:
                self.score_df[predictor] = 0
        return self.score_df

* Function for normalizing the predicted scores to percentiles
* Percentiles are calculated against the predicted scores of the training data set.

In [109]:
def cal_percentile(s,col):
    """Return the percentile rank (kind='mean') of score *s* within column
    *col* of listing_train_for_query_df."""
    reference_scores = listing_train_for_query_df[col].values
    return percentileofscore(reference_scores, s, kind='mean')

def normalize_score(df,all_predictors):
    """Return a copy of *df* in which every column listed in
    *all_predictors* holds cal_percentile(value, column) instead of the raw
    value. *df* itself is left unchanged."""
    result = df.copy()
    for name in all_predictors:
        # Bind the column name as a default so each converter keeps its own.
        def to_percentile(value, column=name):
            return cal_percentile(value, column)
        result[name] = df[name].apply(to_percentile)
    return result

## 9. Query input text

In [126]:
def get_nearest_jobs(query_score,cols = all_predictors,c_cluster=None):
    """Find training listings whose scores are closest to the query's scores.

    Only the first row of *query_score* is used.

    Parameters:
        query_score: DataFrame holding the query's scores in columns *cols*.
        cols: score columns used for the Euclidean distance.
        c_cluster: optional 'Career Cluster' value. When given, only
            listings of that cluster within distance 2.0 are returned.
            Otherwise the (up to) 50 nearest listings closer than 3.0 are
            returned.

    Returns:
        defaultdict mapping rank (0 = nearest) to
        {'job': <Occupation>, 'career cluster': <Career Cluster>}.
        Empty when nothing qualifies.
    """
    recommended_jobs = defaultdict(dict)
    if c_cluster:
        subset_train = listing_train_for_query_df.loc[
            listing_train_for_query_df['Career Cluster'] == c_cluster, :]
        if subset_train.empty:
            # Unknown cluster: nothing to fit on, so nothing to recommend.
            return recommended_jobs
        nn = NearestNeighbors(n_neighbors=50, algorithm='brute', metric='euclidean', n_jobs=1)
        nn.fit(subset_train[cols])
        distances, indices = nn.radius_neighbors(query_score[cols], radius=2.0, return_distance=True)
        # radius_neighbors does not order its results by distance, so sort
        # explicitly; rank 0 must be the closest listing.
        nearest_first = np.argsort(distances[0], kind='mergesort')
        for rank, index in enumerate(indices[0][nearest_first]):
            recommended_jobs[rank]['job'] = subset_train.iloc[index]['Occupation']
            recommended_jobs[rank]['career cluster'] = c_cluster
        return recommended_jobs

    # kneighbors raises when asked for more neighbors than fitted samples.
    n_neighbors = min(50, len(listing_train_for_query_df))
    nn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute', metric='euclidean', n_jobs=1)
    nn.fit(listing_train_for_query_df[cols])
    distances, indices = nn.kneighbors(query_score[cols], return_distance=True)
    for rank, index in enumerate(indices[0]):
        if distances[0][rank] < 3.0:
            listing = listing_train_for_query_df.iloc[index]
            recommended_jobs[rank]['job'] = listing['Occupation']
            recommended_jobs[rank]['career cluster'] = listing['Career Cluster']
    return recommended_jobs

In [141]:
def dict_to_list(score_dict):
    """Turn a sequence of (name, score) pairs into table rows.

    Each pair becomes {'name': name, 'score': int(score)}; input order is
    kept.
    """
    return [{'name': pair[0], 'score': int(pair[1])} for pair in score_dict]

def order_jobs(jobs_dict, top_k=10):
    """Return the first *top_k* recommended jobs as table rows.

    Parameters:
        jobs_dict: mapping of rank -> {'job': ..., 'career cluster': ...}.
        top_k: maximum number of rows to return.

    Returns:
        List of {'name': <job>, 'score': <career cluster>} dicts, ordered by
        ascending rank key. The 'score' key carries the career cluster.
    """
    # Select by sorted rank instead of slicing .values(): dict views cannot
    # be sliced on Python 3, and the order no longer depends on the dict.
    ranked_keys = sorted(jobs_dict)[:top_k]
    return [{'name': jobs_dict[rank]['job'],
             'score': jobs_dict[rank]['career cluster']}
            for rank in ranked_keys]

def _top_scores_table(score_df, columns, top_k=10):
    """Return the *top_k* highest-scoring *columns* of the first row of
    *score_df* as table rows built by dict_to_list."""
    first_row = score_df[columns].to_dict('records')[0]
    # Index-based key: tuple-unpacking lambdas are a syntax error on Python 3.
    ranked = sorted(first_row.items(), key=lambda item: item[1], reverse=True)[0:top_k]
    return dict_to_list(ranked)


def query_predict(query_text, career_cluster=None):
    """Predict skill and knowledge scores for a job text and recommend jobs.

    Parameters:
        query_text: free text of the job posting. Every character other
            than an ASCII letter or a space is replaced by a space before
            prediction.
        career_cluster: optional career cluster passed to get_nearest_jobs
            as its c_cluster filter; None (the default) applies no filter.

    Returns:
        List of three dicts, each with 'tablename' and 'tablecontents':
        'Skills' and 'Knowledge' hold the top 10 percentile-normalized
        scores, and 'recommended_jobs' holds the output of order_jobs.
    """
    query_text = re.sub('[^a-zA-Z ]', ' ', query_text)
    query_point = pd.DataFrame([{'text': query_text}])

    query_prediction = my_prediction(query_point, tfidf_vect, best_estimator)
    query_predicted_score = query_prediction.predict_score()
    query_normalized_score = normalize_score(query_predicted_score, all_predictors)

    query_result = [
        {'tablename': 'Skills',
         'tablecontents': _top_scores_table(query_normalized_score, predictors['Skills'])},
        {'tablename': 'Knowledge',
         'tablecontents': _top_scores_table(query_normalized_score, predictors['Knowledge'])},
    ]

    # The neighbor search uses the raw predicted scores, not the percentiles.
    recommended_jobs = get_nearest_jobs(query_predicted_score, cols=all_predictors, c_cluster=career_cluster)
    query_result.append({'tablename': 'recommended_jobs',
                         'tablecontents': order_jobs(recommended_jobs)})

    return query_result


In [142]:
# Sample job posting used as the query text for the demonstration below.
# It is passed verbatim to query_predict.
input_text='''
Entry Level Software Engineer
Jersey City, NJFull-time
Company Description
Netcloudtek LLC is an IT consulting firm strategically focusing on helping clients to increase business potential. We look forward to continuously enhance our world class business practices to consistently deliver pioneering, comprehensive and cost effective Application Development,Project Management,Infrastructure Management, Cloud-Management, Enterprise, Outsourcing, Industrial, Engineering, Information Technology Services & Solutions.
Job Description
The Entry Level Software Developer is a motivated, energetic, team player who can quickly learn new systems and technologies. This individual will be able to apply their knowledge of software engineering to solve real-world problems for our customers. For this requirement we will hire only GC-EAD, GC, USC, L2-EAD, TPS, Aslyum -EAD visa.

Responsibilities:

Develop/Test application programs, automated process, and other computer systems from detailed or high-level design documents
Program enhancements, code fixes and fulfill ad-hoc requests
Improve support processes and procedures
Performs other related duties as required or requested
Qualifications
Education equivalent to Bachelor's Degree in Information Technology, or the equivalent in related work experience
 '''

In [152]:
# Career clusters numbered from 1, so a cluster can be selected by its number
# (e.g. career_clusters[10]).
career_clusters = dict(enumerate([
    'Government & Public Administration',
    'Architecture & Construction',
    'Finance',
    'Marketing',
    'Hospitality & Tourism',
    'Human Services',
    'Science, Technology, Engineering & Mathematics',
    'Law, Public Safety, Corrections & Security',
    'Health Science',
    'Information Technology',
    'Business Management & Administration',
    'Transportation, Distribution & Logistics',
    'Arts, Audio/Video Technology & Communications',
    'Agriculture, Food & Natural Resources',
    'Manufacturing',
    'Education & Training',
], 1))

In [153]:
# Run the query for the sample posting, passing cluster 10
# ('Information Technology') as the career cluster.
query_result = query_predict(input_text,career_clusters[10]) 

In [154]:
# Display the result: a list of table dicts named 'Skills', 'Knowledge' and
# 'recommended_jobs', each with its rows under 'tablecontents'.
query_result

[{'tablecontents': [{'name': u'Programming', 'score': 95},
   {'name': u'Operations Analysis', 'score': 91},
   {'name': u'Technology Design', 'score': 91},
   {'name': u'Installation', 'score': 86},
   {'name': u'Systems Analysis', 'score': 85},
   {'name': u'Mathematics', 'score': 85},
   {'name': u'Quality Control Analysis', 'score': 82},
   {'name': u'Systems Evaluation', 'score': 80},
   {'name': u'Science', 'score': 79},
   {'name': u'Troubleshooting', 'score': 77}],
  'tablename': 'Skills'},
 {'tablecontents': [{'name': u'Computers and Electronics', 'score': 97},
   {'name': u'Engineering and Technology', 'score': 96},
   {'name': u'Telecommunications', 'score': 95},
   {'name': u'Design', 'score': 94},
   {'name': u'Mathematics_Knowledge', 'score': 90},
   {'name': u'Physics', 'score': 86},
   {'name': u'Building and Construction', 'score': 72},
   {'name': u'Mechanical', 'score': 63},
   {'name': u'Production and Processing', 'score': 52},
   {'name': u'Economics and Accountin