In [2]:
# This notebook is intended to perform semantic analysis of e.g. job descriptions scraped from postings on the web.
# The output is a dimensionality reduced vector of nodes which can be used to compare and match jobs
# The idea is largely based on Semantic hashing by Ruslan Salakhutdinov, Geoffrey Hinton in the Journal of
# Approximate Reasoning, (2009)
# www.thescinder.com


In [3]:
#OUTLINE

# 0. Import necessary packages and define functions for backpropagation and unsupervised learning with RBMs.  
# 1. Scrape the text from job descriptions based on a list of urls, defined by the user
# 2. Convert the job descriptions text into bag of words vectors based on the word frequencies
# 3. Train restricted Boltzmann machines on the bag-of-words vectors
# 4. Using the pre-trained RBM layers as the starting weights to a neural network, train the NN to reproduce the 
# bag of words vectors via backpropagation


In [93]:
# 0. Import necessary packages and define functions for backpropagation and unsupervised learning with RBMs.  

# Import numpy, natch
import numpy as np
# For tic-toc funcitonality
import time

# Set up plotting
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# For getting websites 
import urllib.request
import urllib
from urllib.error import HTTPError

from bs4 import BeautifulSoup as bs
#help(urllib.request)

In [5]:
# define Neural Network sub-functions

def sigmoid(z):
    #Returns the logistic of the value z
    mySig = 1 / (1+np.exp(-z))
    return mySig

def sigmoidGradient(z):
    #return the gradient of a sigmoid function at value z
    mySigGrad = sigmoid(z)*(1-sigmoid(z))
    return mySigGrad

def hidToVis(rbmW,hidStates):
    visProb = np.dot(rbmW.T,hidStates)
    visProb = sigmoid(visProb)
    return visProb

def visToHid(rbmW,visStates):
    hidProb = np.dot(rbmW,visStates)
    hidProb = sigmoid(hidProb)
    return hidProb

def initRBMW(hLayer,vLayer,mySeed=1.0):
    np.random.seed(mySeed)
    rbmW = np.random.random((hLayer,vLayer))
    return rbmW

def sampPRand(myInput,seed=1):
    #Compare input to pseudo-random variables
    myTest = myInput > np.random.random(np.shape(myInput))
    return myTest * 1

def myGoodness(rbmW,hidStates,visStates):
    #m = np.shape(visStates)[1]
    E = - np.mean(np.dot(np.dot(rbmW, visStates).T, hidStates));
    G = -(E); 
    return G
    
def myGoodnessGrad(hidStates,visStates):
    m = np.shape(visStates)[1];
    myGG = np.dot(visStates,hidStates.T)
    myGG = myGG.T/ m;
    return myGG

def trainRBMLayers(a0,hiddenLayers,lR,myIter):
    #Train an RBM layer based on visible input layer a0
    #a0 - visible units
    #hiddenLayers - number of hidden layers
    #lR - learning rate
    #myIter - number of iterations to train
    myTest = a0
    J = []
    rbmW = initRBMW(hiddenLayers,np.shape(myTest)[0],1)
    for j in range(myIter):
        myTest = sampPRand(a0)
        myHid = visToHid(rbmW,myTest)
        myHid0 = myHid
        #print(myHid)
        myHid = sampPRand(myHid)
        #print(myHid[:,10])
        myDream = hidToVis(rbmW,myHid)
        E = (a0-myDream)
        J.append(np.mean(np.abs(E)))
        myDream = sampPRand(myDream)
        myReconProb = visToHid(rbmW,myDream)
        myRecon = sampPRand(myReconProb)
        myPos = myGoodnessGrad(myHid,myTest)

        myNeg = myGoodnessGrad(myRecon,myDream)
        rbmW = rbmW + lR* (myPos-myNeg)
        if ( j % (myIter/10) == 0):
            G = myGoodness(rbmW,myHid,myTest)
            print("Iteration " + str(j)+" Error = " + str(np.mean(np.abs(E))))
            print("Goodness = " + str(G))
            
    plt.plot(J)
    plt.show()
    print("Finished with RBM training of size " + str(np.shape(rbmW.T)))
    return rbmW.T

#forward propagation 
def forProp(myInputs,myWeights):
    #print(np.shape(myInputs))
    #print(np.shape(myWeights[0]))
    zn = [] #np.dot(myInputs,myWeights[0])
    an = [] #sigmoid(zn)
    if(0):
        for n in range(len(myWeights)):
            zn.append(np.zeros((      np.shape(myWeights[n][1])[0],np.shape(a0)[0])))
            an.append(np.zeros((      np.shape(myWeights[n][1])[0],np.shape(a0)[0])))
        
    
    #zn[0,:] = np.array([np.dot(myInputs,myWeights[0])])
    #an[0,:] = sigmoid(zn[0,:])
    if(0):
        zn[0] = np.dot(myInputs.T,myWeights[0])
        an[0] = sigmoid(zn[0])
    zn.append(np.squeeze(np.dot(myInputs.T,myWeights[0])))
    #print(np.shape(zn))
    an.append(sigmoid(zn[0]))
    #print(np.shape(zn[0]))
    for n in range(1,len(myWeights)):
        #print(np.shape(an))
        #print(np.shape(myWeights[n]))
        if(0):
            print(n)
            zn[n] = np.dot(an[n-1],myWeights[n])
            an[n] = sigmoid(zn[n])
        zn.append(np.dot(an[n-1],myWeights[n]))
        an.append(sigmoid(zn[n]))
        #print(np.shape(an[n]))
        #print(np.shape(an))
    #print(np.shape(an[0]))
    #print(np.shape(np.squeeze(an)))
    return an,zn



#back propagation function

def backProp(myInputs,myCrossInputs,myTarget,myWeights,myIter=10,lR=1e-5,myLambda=0,myMom=1e-5,dropout=False):
    #init momentum
    momSpeed = []
    #init weight penalties
    wPen = []
    #init gradients
    dGrad = []
    Delta = []
    for n in range(len(myWeights)): #-1,-1,-1):
        wPen.append(0*myWeights[n])
        dGrad.append(0*myWeights[n])
        momSpeed.append(0*myWeights[n])
        Delta.append(0*myWeights[n])
        #print(np.shape(wPen[n]))
        #print(np.shape(dGrad[n]))
        #print(np.shape(momSpeed[n]))
    m = m = np.shape(myInputs)[1]
   
    myFreq = int(myIter/10)
  
    print("Begin Training . . . ")
    for i in range(myIter):
        #Run forward propagation.
        myOutput, myZ = forProp(myInputs,myWeights)
        #print(np.shape(myOutput))
        #print(np.shape(myZ[0]))
        #use squared error as objective function
        E = (myTarget.T-myOutput[len(myOutput)-1])
        d = []
        #print(np.shape(E.T))
        d.append(E.T)
        #print(np.shape(d[0]))
        #d.append(E.T)
       

        if(i%myFreq == 0):
            print("Iteration " + str(i) + " Mean error = "+str(np.mean(np.abs(E))))


        for n in range(len(myWeights)-1,-1,-1):
            #
            d.append(np.dot(myWeights[n],d[len(d)-1]) * sigmoidGradient(myZ[n-1].T))
           

        for n in (range(len(myWeights)-1,-1,-1)):
           
            for i in range(m-1):
                #
                Delta[n] =Delta[n] + np.dot(np.array([myOutput[n-1][i,:]]).T,np.array([d[len(d)-(n+2)][:,i]]))
               
        
        for n in (range(len(myWeights)-1,-1,-1)):
            wPen[n] = myLambda * myWeights[n]

        for n in (range(len(myWeights)-1,-1,-1)):
            #print(n)
            dGrad[n] = Delta[n]/m +wPen[n]
            momSpeed[n] = myMom*momSpeed[n] + dGrad[n]
            
            myWeights[n] = myWeights[n] + momSpeed[n] * lR
            #print(np.mean(Theta3))
        #print("Training Finished, avg error = "+str(np.mean(np.abs(E))))
        #print(E)
    return myWeights

In [108]:
# Get the job posting and convert it to text
myAddys = ['https://careers.teamio.net/ixonos/detail/?id=b5e00fa5-487c-49c4-8e0a-239bde830c25&rps=254']
myAddys.append('https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*6C4FD5DE5BA681FD')
myAddys.append('https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*507F3DB1ABD68410')
myAddys.append('https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*1041FEACD2D84A0C')
myAddys.append('https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*AB19909338AB7990')
myAddys.append('https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*C357A2E3CF4A1817')
myAddys.append('https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*4CC524F488C64F42')
myAddys.append('https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*B679386F7151134D')
myAddys.append('https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*DC4E3E5A8B43F5C3')
myAddys.append('https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*C37F066CC77D70E3')
myAddys.append('http://careers.intuit.com/job-category/7/data/job/00126566/data-scientist?src=JB-10116&utm_source=indeed&utm_medium=jb')
myAddys.append('https://jobs.lever.co/matterport/179dcc62-57ec-40d2-a984-19ae8b8e0d50?lever-source=indeed')
myAddys.append('https://jobs.apple.com/search?job=53180989&openJobId=53180989#&openJobId=53180989')
myAddys.append('https://jobs.apple.com/search?job=52186263&openJobId=52186263#&openJobId=52186263')
myAddys.append('https://jobs.lever.co/faradayfuture/c7c94e42-a29b-443e-874a-5d3827d038c8')
myAddys.append('https://jobs.apple.com/search?job=55915685&openJobId=55915685#&openJobId=55915685')
myAddys.append('https://jobs.apple.com/search?job=55250105&openJobId=55250105#&openJobId=55250105')
myAddys.append('https://abbvie.taleo.net/careersection/2/jobdetail.ftl?lang=en&job=138021&src=JB-11040')
myAddys.append('https://app.trinethire.com/companies/1710-beyond-meat/jobs/4112-scientist?source=indeed&utm_source=Indeed&utm_medium=organic&utm_campaign=Indeed')
myAddys.append('http://jobs.jobvite.com/careers/myokardia/job/ognN4fwe?__jvst=Job%20Board&__jvsd=Indeed')
myAddys.append('http://jobs.thermofisher.com/ShowJob/Id/9969/Scientist%20I,%20Cell%20Biology')
myAddys.append('http://jobs.qb3.org/jobdetail.php?jobid=593325')
myAddys.append('https://jobs.apple.com/search?job=29861569&openJobId=29861569#&openJobId=29861569')
myAddys.append('https://jobs.apple.com/search?job=55107714&openJobId=55107714#&openJobId=55107714')
myAddys.append('https://jobs.lever.co/boostedboards/ffb19c79-5412-48c0-a925-67c25455e80d')

myText = ''
for addy in myAddys:
    print(addy)
    #myURL = urllib.request.urlopen(addy)
    req = urllib.request.Request(url=addy,headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
    try:
        handler = urllib.request.urlopen(req)
    except HTTPError as e:
        content = e.read()
    myURL = handler #urllib.request.urlopen(myAddy)

    myTest = myURL.read()
    soup = bs(myTest, 'html.parser')
    myText = myText + soup.get_text()
    

https://careers.teamio.net/ixonos/detail/?id=b5e00fa5-487c-49c4-8e0a-239bde830c25&rps=254
https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*6C4FD5DE5BA681FD
https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*507F3DB1ABD68410
https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*1041FEACD2D84A0C
https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*AB19909338AB7990
https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*C357A2E3CF4A1817
https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*4CC524F488C64F42
https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*B679386F7151134D
https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*DC4E3E5A8B43F5C3
https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*C37F066CC77D70E3
http://careers.intuit.com/job-category/7/data/job/00126566/data-scientist?src=JB-10116&utm_source=indeed&utm_medium=jb
https://jobs.lever.co/matterport/179dcc62-57ec-40d2-a984-19ae8

In [132]:
# Create a dictionary bag-of-words for 
wordcount = {}
weirdChar = ['[','{',']','}',':',';','%','/','/','.filename','-','+','=']



for word in myText.split():
    # Check for weird character
    dontAdd = 1
    for wChar in weirdChar:
        if(word.find(wChar) != -1):
            wordcount = wordcount
            dontAdd = 0
            #print('weird character detected')
    # Add word to wordcount dictionary if it seems to be a real weird. 
    if (dontAdd):
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1
print(len(wordcount))

if(0):
    for k, v in wordcount.items():
        print(k,v)

# Build bag-of-words Dictionary based on the combined job descriptions
bowDict = {}
for word in wordcount:
    bowDict[word] = 0  

3777


In [117]:
myAddy.append()

True

In [20]:
# Break the text into a description and requirements section, if possible
myDescLoc = myText.find('Description')
if (myDescLoc == -1):
    #print('lowercase')
    myDescLoc = myText.find('description')
elif (myDescLoc == -1):
    print('Failed to find job description')
        
myReqLoc = myText.find('Requirements')
if (myReqLoc == -1):
    #print('lowercase')
    myReqLoc = myText.find('requirements')
elif (myReqLoc == -1):
    print('Failed to find job requirements')
    
myDesc = myText[myDescLoc:myReqLoc]
myReq = myText[myReqLoc:len(myText)]
#print(myText[myReq:len(myText)])

In [72]:
wordcount = {}
for word in myText.split():
    if word not in wordcount:
        wordcount[word] = 1
    else:
        wordcount[word] += 1
len(wordcount)

    

1793

In [97]:
# Get the job posting and convert it to text
#myURL = urllib.request.urlopen('https://rn11.ultipro.com/SPA1004/JobBoard/JobDetails.aspx?__ID=*1041FEACD2D84A0C')
if (1):
    myAddy = 'https://www.indeed.com/q-Neural-Networks-l-California-jobs.html'
    #myAddy = 'https://www.indeed.com/jobs?q=deep+learning&l=California'
    myAddy = 'https://www.indeed.com/jobs?q=engineer&l=california'
    
    req = urllib.request.Request(url=myAddy,headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
    try:
        handler = urllib.request.urlopen(req)
    except HTTPError as e:
        content = e.read()
    myURL = handler #urllib.request.urlopen(myAddy)

    myTest = myURL.read()
    soup = bs(myTest, 'html.parser')
    myText = soup.get_text()
myLinks = []
for link in soup.find_all('a'):
    if link.get('href'):
        #print(link.get('href'))
        if '/rc/' in link.get('href'):
            myLinks.append('https://www.indeed.com' + link.get('href'))
#print(myLinks)
#print(len(myLinks))
myText = ''
for joblink in myLinks:
    print(joblink)
    req = urllib.request.Request(url=joblink,headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
    try:
        handler = urllib.request.urlopen(req)
    except HTTPError as e:
        content = e.read()
    myURL = handler
    myReadURL = myURL.read()
    soup = bs(myReadURL,'html.parser')
    myTextTemp = soup.get_text()
    myDescLoc = myTextTemp.find('Description')
    print('Found Description')
    if (myDescLoc == -1):
        myDescLoc = myTextTemp.find('description')
    elif (myDescLoc == -1):
        print('Failed to find job description')
    else:
        print('Found description')
        
    myText = myText + myTextTemp[myDescLoc:len(myTextTemp)]
    
len(myText)
        

https://www.indeed.com/rc/clk?jk=71a14402cf0752c1&fccid=8cd9473095cc9d56
Found Description
https://www.indeed.com/rc/clk?jk=6d9525033e54c05e&fccid=11619ce0d3c2c733
Found Description
Found description
https://www.indeed.com/rc/clk?jk=d0a5d3785913b495&fccid=9e051e1a77153a8e
Found Description
https://www.indeed.com/rc/clk?jk=596f17f5e28445f4&fccid=ed6e14306a80ebb2


URLError: <urlopen error EOF occurred in violation of protocol (_ssl.c:645)>

In [101]:
myTest = ['cat']
myTest.append('dog')
print(myTest)

['cat', 'dog']
