In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import ast 
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
# from nltk.stem.snowball import SnowballStemmer
# from nltk.stem.wordnet import WordNetLemmatizer
# from nltk.corpus import wordnet
# from surprise import Reader, Dataset, SVD, evaluate
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

from uszipcode import SearchEngine
from  geopy.distance import distance


MAX_DISTANCE = 15

import warnings; warnings.simplefilter('ignore')



In [2]:
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

In [3]:

#!dir .\Data\*.tsv

In [None]:
folder = 'Data'

apps = pd.read_csv('./'+folder+'/apps.tsv', delimiter='\t',encoding='utf-8')
user_history = pd.read_csv('./'+folder+'/user_history.tsv', delimiter='\t',encoding='utf-8')
jobs = pd.read_csv('./'+folder+'/jobs.tsv', delimiter='\t',encoding='utf-8', error_bad_lines=False)
users = pd.read_csv('./'+folder+'/users.tsv' ,delimiter='\t',encoding='utf-8')
test_users = pd.read_csv('./'+folder+'/test_users.tsv', delimiter='\t',encoding='utf-8')


b'Skipping line 122433: expected 11 fields, saw 12\n'
b'Skipping line 602576: expected 11 fields, saw 12\n'
b'Skipping line 990950: expected 11 fields, saw 12\n'


### EDA

In [None]:
apps.head()


In [None]:
user_history.head()

In [None]:
jobs.head()

In [None]:
users.head()

In [None]:
test_users.head()
len(users)

In [None]:
users = users.loc[users.State == 'NY']
len(users)

In [None]:
users = users.sample(frac=0.2, replace=False, random_state=1)
len(users)

#### Subsetting jobs in NY

In [None]:
jobs = jobs.loc[jobs.State == 'NY']

#jobs.to_csv("NYjobs.tsv",  sep='\t',encoding='utf-8')

#### Subsetting jobs in Zip

In [None]:
jobs.Zip5 = jobs.Zip5.fillna(0)
jobs.Zip5 = jobs.Zip5.astype(int)
jobs = jobs.loc[jobs['Zip5'] != 0]

In [None]:

jobs = jobs.sample(frac=0.2, replace=False, random_state=1)
len(jobs)

In [None]:
import gc
gc.collect()

In [None]:

jobs.groupby(['Zip5', 'City']).size().reset_index(name='Count').sort_values('Count', ascending=False).head()

### Preprocessing

### Preprocessing Description and Requirements

In [None]:
import re

def preprocessor(text):
    text = text.replace('\\r', '').replace('&nbsp', '').replace('\n', '')
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [None]:

jobs['Description'] = jobs['Description'].astype(dtype='str').apply(preprocessor)

In [None]:
jobs['Requirements'] = jobs['Requirements'].astype(dtype='str').apply(preprocessor)

### Creating jobs coordinates

In [None]:
search = SearchEngine(simple_zipcode=True)

In [None]:
#lat, "long"
"""coords_1 = c
coords_2 = (34.1, -118.42)

print (round(distance(coords_1, coords_2).miles, 2))"""
search.by_zipcode("2e").lat

In [None]:

def coordinates(zipcode):
    zipcode = search.by_zipcode(zipcode)
    community = zipcode.post_office_city
    return community, "{},{}".format(zipcode.lat, zipcode.lng)


In [None]:
%%timeit
jobs["Community"] = ""
jobs["Coordinates"] = ""
for zipcode in jobs.Zip5.unique():
    community, coordinate = coordinates(zipcode)
    jobs.loc[jobs.Zip5 == zipcode, "Coordinates"] =  str(coordinate)
    jobs.loc[jobs.Zip5 == zipcode, "Community"] =  community

In [None]:
jobs.groupby(['Community']).size().reset_index(name='Count').sort_values('Count', ascending=False).head(20)

In [None]:
jobs.groupby(['Community', 'Zip5', 'Coordinates']).size().reset_index(name='Count').sort_values('Count', ascending=False).head(20)

### creating user coordinates

In [None]:

def creating_coordinates(df, column):
    df["Community"] = ""
    df["Coordinates"] = "None, None"
    for zipcode in df[column].unique():
        community, coordinate = coordinates(zipcode)
        df.loc[df[column] == zipcode, "Coordinates"] =  str(coordinate)
        df.loc[df[column] == zipcode, "Community"] =  community
    return df
users = creating_coordinates(users, 'ZipCode')

In [None]:
users.groupby(['Community']).size().reset_index(name='Count').sort_values('Count', ascending=False).head(20)

### Distance Matrix 

In [None]:
users.head(2)

In [None]:
jobs.head(2)

In [None]:
apps.head(2)

In [None]:
user_history.head(2)

### Subset most applied jobs by title

### Looking similar jobs

In [None]:
current_app_intercep = np.intersect1d(users.UserID.unique(), apps.UserID.unique())
historical_app_intercep = np.intersect1d(users.UserID.unique(), user_history.UserID.unique())
total_users = users.UserID.unique()
with_info =  set(current_app_intercep.tolist() + historical_app_intercep.tolist())

current_app = 1-len(current_app_intercep )/len(total_users )
historical_app = 1-len(historical_app_intercep)/ len(total_users )
cold_star_p = len(with_info)/len(total_users )

print(current_app, historical_app, cold_star_p)


In [None]:
#distance_jobs_df = pd.DataFrame(columns=jobs.JobID.unique(), index = users.index)

### Looks for most applied jobs id

### Distance matrix

In [95]:


#distance_jobs_df = pd.DataFrame(columns=jobs.JobID.unique(), index = users.index)

In [96]:
#distance_jobs_df.to_csv()

### Recomender top k liked

In [None]:
def user_exist(user):
    if len(users.loc[users['UserID'] == user]) == 0:
        return False
    return True

def has_coordinates(user):
    if len(users.loc[users['UserID'] == user, "Coordinates"]) == 0:
        return False
    return True
    

In [None]:
unique = 0.3 #update name
top = 20

def ranking_by_popularity(top):
    
    popular_jobs = user_history.groupby(
    ['JobTitle']).size().reset_index(
    name='Count').sort_values('Count', ascending=False)
    ranking =  dict()
    top_i = 0
    
    while True:
        job_title = popular_jobs['JobTitle'].iloc[top_i]
        jobs_list = jobs.loc[jobs['Title'] == job_title, ['JobID']]['JobID'].unique().tolist()

        if len(jobs_list) > 1:
            ranking[job_title] = jobs_list

        if len(ranking) == top:
            break

        top_i +=1
    return ranking

ranking_popular = ranking_by_popularity(top)

In [None]:

def recommender_popular_jobs(user, unique, top):
    
    recommended_popular_jobs = dict()
    c1 = users.loc[users['UserID'] == user, 'Coordinates']
    
    if user_exist(user) and has_coordinates(user):
        for title, jobs_list in ranking_popular.items():
            
            distances = dict()
            
            for job in jobs_list:
                
                c2 = jobs.loc[jobs['JobID'] == job, 'Coordinates']
                if c2.iloc[0].split(',')[0] == 'None':
                    continue
                distances[job] = round(distance(c1, c2).miles, 2)
            
            distances = sorted(distances.items(), key=lambda kv: kv[1])
            closest = distances[0]
            if closest[1] >= MAX_DISTANCE:
                continue
            recommended_popular_jobs[title] = (closest[0], closest[1])
            
    else:
        for title, jobs_list in ranking_popular.items():
            recommended_popular_jobs[title] = jobs_list[0]
            
    return recommended_popular_jobs

recommender_popular_jobs(98, unique, top)

### Content Based title x profile



In [None]:
jobs['profile'] = jobs['Title'].astype(str)  +  '. ' + jobs['Requirements'].astype(str) +  '. ' + jobs['Description'].astype(str) 

In [None]:
%%timeit

users['profile'] = (users['DegreeType'].astype(str) + " ")*5 + (users['Major'].astype(str) +  " " )*5
#+ " Years of Experience " + users['TotalYearsExperience'].astype(str)
#+ str(p.number_to_words(users["TotalYearsExperience"])) + " Years of Experience " + 


##### Garbage words

In [None]:
users['profile'] = users['profile'].str.replace('None.', '.')
users['profile'] = users['profile'].str.replace('Not Applicable', '.')
users['profile'] = users['profile'].str.replace(' nan', ' ') 

users['profile'].iloc[1]

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from gensim.test.utils import get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec

import pickle

simple_preprocess(users['profile'].iloc[1])

In [None]:
class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''

    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))
        self.epoch += 1
        

In [None]:
jobs['Coordinates']

In [None]:
#If user has coordinates
#if user has info then similarties if not popularity ranker


def content_distance_based_recommender(user_id, jobID_mapping = jobID_mapping, model =  model, top = 10):
    #As infer_vector produce stochastics result I made a for to save the best list
    user_profile = np.array(users.loc[users['UserID'] == user_id, 'profile'])[0]
    historical_apps = user_history.loc[user_history.UserID == user_id, 'JobTitle']
    for application in historical_apps:
        user_profile += ". " + (str(application) + " ")*10
    user_profile = simple_preprocess(user_profile)
    
    best = 0
    top10=pd.DataFrame(index = range(top), columns = ['JobID', 'Title', 'Distance', 'Description', 'Requirements'])
    c1 = users.loc[users['UserID'] == user_id, 'Coordinates']
    job_distance_list = list()
    for i in range (5):
        inferred_vector = model.infer_vector(user_profile)
        sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
        sum_results = 0
        count = 0
        total_recom = 0
        job_distance_list = list()
        
        while True:
            
            c2 = jobs.loc[jobs['JobID'] == sims[count][0], 'Coordinates']
            count +=1
            if len(c2) == 0:
                print("Empy")
                continue
            
            if c2.iloc[0].split(',')[0] == 'None':
                print('None')
                continue
                
            job_distance = round(distance(c1, c2).miles, 2)
            
            if job_distance >= MAX_DISTANCE:
                print('Distance')
                continue

            sum_results+=sims[count][1]
            total_recom +=1
            job_distance_list.append(job_distance)
            if total_recom == top:
                break
        
        ##Best simulation
        if sum_results > best:
            best = sum_results
            best_sim = sims
            job_distance_list2 = job_distance_list
        print(count)

            
            
    for i in range(top):
        recomendation = jobID_mapping[best_sim[i][0]]
        top10.iloc[i]['JobID', 'Title',  'Description', 'Requirements'] = np.array(jobs.loc[jobs['JobID'] == recomendation][['JobID', 'Title', 
                                                               'Description', 'Requirements']])[0]
        top10.iloc[i]['Distance'] = job_distance_list2[i]
    
    return top10, sims    


t, s = content_distance_based_recommender(user_id, top=15)    
        

In [1070]:
t

Unnamed: 0,JobID,Title,Distance,Description,Requirements
0,1080071,ADMINISTRATIVE/CLERICAL,13.28,colating,
1,244585,Assistant Produce Manager,1.9,know how to put the super in supermarket at ha...,here is that you will need to bring to the pos...
2,549641,ADMINISTRATIVE/CLERICAL,2.07,colating,
3,563550,Executive Administrator,1.38,,
4,1101142,Licensing Associate-Hiring Event 6/6!!,1.38,we are currently looking for a licensing assoc...,four year degree in business administration f...
5,897408,Manager - Employee Relations,1.38,the employee relations manager serves as a key...,bachelor s or graduate degree in human resour...
6,1114193,Internal Auditor,2.07,internal auditor top woodbury firm seeks indiv...,minimum 3 years experience
7,731174,Mortgage Support: 2nd Shift 3-12,9.81,ost professional a division of open systems te...,if you meet the following qualifications pleas...
8,1051392,Financial Sales Professional- Entry Level or E...,0.69,join axa ndash the axa group is listed in the...,job requirements individuals who excel at axa ...
9,708253,Risk Management Account Management - NY,0.52,large national company is seeking an account m...,


In [1038]:
%%time
def similarities_nlp_model( model_name = "jobs_doc2vec_model_3",
                 mapping_name = "jobID_mapping_3.p", max_epochs = 100,
                 alpha = 0.025):
    
    document = list()
    jobID_mapping = dict()
        
    for i, token in enumerate(jobs['profile']):
        value = jobs.iloc[i]["JobID"]
        tokens = TaggedDocument(simple_preprocess(token), [i])
        document.append(tokens)
        jobID_mapping[i] = value
        print(i)

    epoch_logger = EpochLogger()
    model = Doc2Vec(size = 20, alpha=alpha, 
                    min_alpha=0.00025, min_count=1,
                    callbacks=[epoch_logger], dm =1, workers=8
                   )
    
    model.build_vocab(document)
    
    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model.train(document, 
                    total_examples=model.corpus_count, 
                    epochs=model.iter)
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha

    # create a dictionary
    pickle.dump(model, open(model_name, "wb")) 
    pickle.dump(jobID_mapping, open(mapping_name, "wb")) 

# -------------------------------------------------------------
# Load the dictionary back from the pickle file.
    return model, jobID_mapping

model, jobID_mapping = similarities_nlp_model()
jobID_mapping = pickle.load(open("jobID_mapping.p", "rb"))
model  = pickle.load(open("jobs_doc2vec_model", "rb"))

Epoch #0 start
Epoch #1 start
Epoch #2 start
Epoch #3 start
Epoch #4 start
Wall time: 20.6 s


In [1035]:
jobID_mapping = pickle.load(open("jobID_mapping.p", "rb"))
model  = pickle.load(open("jobs_doc2vec_model", "rb"))

<gensim.models.doc2vec.Doc2Vec at 0x278ac508c88>

In [1042]:
#jobID_mapping = pickle.load(open("jobID_mapping.p", "rb"))
#model  = pickle.load(open("jobs_doc2vec_model", "rb"))

def content_based_recommender(user_id, jobID_mapping = jobID_mapping, model =  model, top = 100):
    #As infer_vector produce stochastics result I made a for to save the best list
    user_profile = np.array(users.loc[users['UserID'] == user_id, 'profile'])[0]
    historical_apps = user_history.loc[user_history.UserID == user_id, 'JobTitle']
    for application in historical_apps:
        user_profile += ". " + (str(application) + " ")*1
    user_profile = simple_preprocess(user_profile)
    
    best = 0
    top10=pd.DataFrame(index = range(top), columns = ['JobID', 'Title', 'Description', 'Requirements'])
    
    
    for i in range (100):
        inferred_vector = model.infer_vector(user_profile, topn=len(model.docvecs))
        sims = model.docvecs.most_similar([inferred_vector])
        sum_results = 0
        
        for i in range(top):
            sum_results+=sims[i][1]
            
        if sum_results > best:
            best = sum_results
            best_sim = sims
            
            
    for i in range(top):
        recomendation = jobID_mapping[best_sim[i][0]]
        top10.iloc[i] = np.array(jobs.loc[jobs['JobID'] == recomendation][['JobID', 'Title', 
                                                               'Description', 'Requirements']])[0]
    
    
    return top10, sims


In [1041]:
user_id = 248008
t, s = content_based_recommender(user_id, top=10)
print(user_profile(user_id))

"""248008,
 1184529,
 1122774,
 1165147,
 875025,
 932674,
 563339,
 1433717,
 944873,
 153286,"""


    Degree Type: None
    Major: Electrical Engineering
    Previous Applications:
    
Compliance Associate / Temporary Consultant
Compliance Associate/ Administrative Specialist


'248008,\n 1184529,\n 1122774,\n 1165147,\n 875025,\n 932674,\n 563339,\n 1433717,\n 944873,\n 153286,'

In [948]:
t.Title

0                     ADMINISTRATIVE/CLERICAL
1                     ADMINISTRATIVE/CLERICAL
2                               GENERAL LABOR
3                                          GL
4                                          GL
5                                          GL
6                                          GL
7                                          GL
8    MEDICAL ASSISTANT / MEDICAL RECEPTIONIST
9                     Executive Administrator
Name: Title, dtype: object

In [950]:
t.Description


0                                             colating
1                                             colating
2                                                 none
3    westfield ny seasonal openings all shifts with...
4    westfield ny seasonal openings all shifts with...
5    westfield ny seasonal openings all shifts with...
6    westfield ny seasonal openings all shifts with...
7    westfield ny seasonal openings all shifts with...
8    medical assistant medical receptionistmedical ...
9                                                     
Name: Description, dtype: object

In [1021]:
s

[(10091, 0.6514291763305664),
 (4365, 0.6238051652908325),
 (10528, 0.6194992065429688),
 (10625, 0.6180371642112732),
 (2716, 0.6179522275924683),
 (723, 0.6152012348175049),
 (4445, 0.5981876850128174),
 (8527, 0.5963398218154907),
 (1772, 0.5838823318481445),
 (3529, 0.5803802013397217)]

In [860]:
def user_profile(user_id):
    user_profile = np.array(users.loc[users['UserID'] == user_id, ['DegreeType', 'Major']])[0]
    historical_apps = user_history.loc[user_history.UserID == user_id, 'JobTitle']
    user_profile = """
    Degree Type: {}
    Major: {}
    Previous Applications:
    """.format(user_profile[0], user_profile[1])
    
    for application in historical_apps:
        user_profile += "\n" + str(application)
        
    return user_profile

#### User to User
#### Distance Recommender



### Testing