In [3]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Activation
from keras.layers import Embedding

In [165]:
#loading datset
data = pd.read_csv('D:/Job_prediction/job_nouns2.csv')
data.head()

Unnamed: 0,sr. no,Description,skills/no-skills
0,1,data analyst,1
1,2,data,1
2,3,needs,0
3,6,internal stakeholders,0
4,7,business problem formulation requirements,0


In [166]:
# assigning values for x and target value y
x = data.Description.iloc[:]
y = data.iloc[:,-1]

In [55]:
print(x.shape)


(9243,)


In [49]:
# initilizing the text tokernizer
'''Tokernizer allows to vectorize a text corpus, by turning each
   text into either a sequence of integers'''

tokenizer = Tokenizer()

#  converting our data to sequence of integers
tokenizer.fit_on_texts(x)

In [119]:
# tokenized words
tokenizer.index_word

{1: 'data',
 2: 'experience',
 3: 'development',
 4: 'software',
 5: 'team',
 6: 'skills',
 7: 'business',
 8: 'work',
 9: 'time',
 10: 'management',
 11: 'ability',
 12: 'solutions',
 13: 'full',
 14: 'technical',
 15: 'knowledge',
 16: 'tools',
 17: 'job',
 18: 'years',
 19: 'systems',
 20: 'new',
 21: 'services',
 22: 'code',
 23: 'health',
 24: 'design',
 25: 'requirements',
 26: 'technology',
 27: 'environment',
 28: 'application',
 29: 'career',
 30: 'cloud',
 31: 'teams',
 32: 'company',
 33: 'quality',
 34: 'toronto',
 35: 'program',
 36: 'strong',
 37: 'process',
 38: 'analysis',
 39: 'support',
 40: 'aws',
 41: 'canada',
 42: 'information',
 43: 'infrastructure',
 44: 'benefits',
 45: 'care',
 46: 'stack',
 47: 'computer',
 48: 'research',
 49: 'products',
 50: 'technologies',
 51: 'engineering',
 52: 'industry',
 53: 'projects',
 54: 'opportunity',
 55: 'developer',
 56: 'performance',
 57: 'practices',
 58: 'processes',
 59: 'analytics',
 60: 'service',
 61: 'best',
 62: 'p

In [50]:
# making sequences for our data
sequences = tokenizer.texts_to_sequences(x)
sequences[:5]

[[1, 84], [1], [95], [112, 100], [7, 96, 1396, 25]]

In [57]:
# trying to predict tokenized sequence for word 'data'
tokenizer.texts_to_sequences(['data'])

[[1]]

In [34]:
len(sequences)

9243

In [40]:
'''pad_sequences transforms a list of
    `num_samples` sequences (lists of integers)
    into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
    `num_timesteps` is either the `maxlen` argument if provided,
    or the length of the longest sequence otherwise'''
    
embedded_x = pad_sequences(sequences)
embedded_x

array([[   0,    0,    0, ...,    0,    1,   84],
       [   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    0,   95],
       ...,
       [   0,    0,    0, ...,    0,    0,   87],
       [   0,    0,    0, ...,    0,    0,   65],
       [   0,    0,    0, ...,  645,  568, 1186]])

In [41]:
embedded_x.shape

(9243, 20)

In [42]:
# Generating LSTM model
'''first layer is an embedding layer. The layer lets the system expand each token to a more massive vector, 
    allowing the network to represent a word in a meaningful way.
    The layer takes 9243 as the first argument, 
    which is our dataset size, and 100 as the second input parameter, which is the dimension of the embedding. 
    The third parameter is the input_length of embedded_x colum values, which is 20,the length of each sequence in embedded_x'''
    
model = Sequential()
model.add(Embedding(len(embedded_x), 100, input_length=embedded_x.shape[1]))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [43]:
# fiting the model
model.fit(embedded_x, np.array(y), validation_split=0.4, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1f2e5cb32b0>

In [78]:
# trying to perdict with word 'data'
model.predict(tokenizer.texts_to_sequences(['data']))



array([[0.89267933]], dtype=float32)

In [89]:
# saving the model
model.save('D:/Job_prediction/skill_prediction.h5')

In [95]:
import pickle

# saving tokenizer
with open('D:/Job_prediction/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
import pickle
from keras_preprocessing.text import tokenizer_from_json

#loading tokenizer
with open('D:/Job_prediction/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)


In [4]:
# loading our model
from keras.models import load_model
model = load_model('D:/Job_prediction/skill_prediction.h5')

In [7]:
from keras_preprocessing.sequence import pad_sequences
def predict(model,tokenizer,word):
    input = pad_sequences(tokenizer.texts_to_sequences([word]),maxlen=20)
    # input = tokenizer.texts_to_sequences([word])
    d = model.predict(input)
    print('probebilty ',d)
    if d <= 0.5:
        d = 'non skill word'
    else:
        d = 'skill word' 
    return d
    

In [8]:
predict(model,tokenizer,'python')

probebilty  [[0.9867021]]


'skill word'

In [9]:
predict(model,tokenizer,'sql')

probebilty  [[0.984691]]


'skill word'

In [10]:
predict(model,tokenizer,'company')

probebilty  [[0.00143993]]


'non skill word'

In [11]:
predict(model,tokenizer,'priyanka')

probebilty  [[0.01861832]]


'non skill word'

In [12]:
predict(model,tokenizer,'')

probebilty  [[0.01861832]]


'non skill word'

In [14]:
predict(model,tokenizer,'coffee')

probebilty  [[0.00767113]]


'non skill word'

In [19]:
predict(model,tokenizer,'data garba')

probebilty  [[0.95231634]]


'skill word'