<a href="https://colab.research.google.com/github/usamaeltmsah/Job-Classifier/blob/master/job_title_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%capture
# Install the latest TensorFlow version.
!pip3 install --upgrade tensorflow-gpu
# Install TF-Hub.
!pip3 install tensorflow-hub
!pip3 install seaborn

In [0]:
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

# TF Hub handle of the sentence-encoder module to load (exposed as a Colab form field via #@param).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
# Load the module once; `model` is the callable that embed() invokes further down.
model = hub.load(module_url)
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder-large/5 loaded


## Mounting Google Drive locally

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset path used later in the notebook resolves.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def read_dataset(file_path):
  """Load the job-description CSV into a DataFrame.

  The first row of the file is treated as a header and discarded; the
  columns are renamed to 'Date', 'Last Update', 'Job Title' and
  'Job Description'.

  Args:
    file_path: path (or file-like object) accepted by pandas.read_csv.

  Returns:
    The loaded pandas DataFrame.
  """
  column_names = ['Date', 'Last Update', 'Job Title', 'Job Description']
  return pd.read_csv(file_path, header=0, names=column_names)

## Remove punctuation

In [0]:
def rem_punc(data):
  """Strip punctuation from every string in a pandas Series.

  Every character that is neither a word character nor whitespace is
  removed.

  Args:
    data: pandas Series of strings.

  Returns:
    A new Series with the punctuation removed.
  """
  # regex=True is required: recent pandas versions default to a literal
  # (non-regex) match, which would leave the text untouched. The raw string
  # avoids the invalid-escape warning for "\w".
  return data.str.replace(r'[^\w\s]', '', regex=True)

In [0]:
def rem_punc_from_text(text):
  """Return `text` with every non-word, non-whitespace character removed."""
  punctuation = re.compile(r'[^\w\s]')
  return punctuation.sub('', text)

## Removing Stop Words

In [0]:
import nltk
# Fetch the NLTK stop-word corpus used on the next lines.
nltk.download("stopwords")
from nltk.corpus import stopwords
# English stop words; consulted by rem_stop_words() and rem_stop_words_from_text().
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
def rem_stop_words(data):
  """Drop stop words from every string in a pandas Series.

  Each string is split on whitespace, tokens found in the module-level
  `stop` list are discarded, and the remaining tokens are re-joined with
  single spaces. Matching is case-sensitive.
  """
  def _without_stop_words(sentence):
    kept = [word for word in sentence.split() if word not in stop]
    return " ".join(kept)

  return data.apply(_without_stop_words)

In [0]:
def rem_stop_words_from_text(text):
  """Return `text` with tokens found in the module-level `stop` list removed.

  The text is split on whitespace and the surviving tokens are re-joined
  with single spaces. Matching is case-sensitive.
  """
  kept_words = []
  for word in text.split():
    if word not in stop:
      kept_words.append(word)
  return " ".join(kept_words)

In [0]:
def count_words(sentence):
  """Return the number of words (maximal runs of word characters) in `sentence`."""
  word_matches = re.finditer(r'\w+', sentence)
  return sum(1 for _ in word_matches)

In [0]:
def compare_strings_lengths(s1, s2):
  """Return the word count of `s1` minus the word count of `s2`.

  The result is positive when `s1` has more words, negative when `s2` has
  more, and zero when they have the same number.
  """
  first_count = count_words(s1)
  second_count = count_words(s2)
  return first_count - second_count

In [0]:
def which_larger(s1, s2):
  """Return the index (0 or 1) of the sentence with more words.

  Returns 0 only when `s1` has strictly more words than `s2`; otherwise
  returns 1, so a tie yields 1.
  """
  if count_words(s1) > count_words(s2):
    return 0
  return 1

In [0]:
def embed(input):
  """Run the module-level `model` on `input` and return whatever it produces."""
  embeddings = model(input)
  return embeddings

In [0]:
def calc_corr(features):
  """Return the matrix of pairwise inner products of `features` with itself.

  Entry (i, j) is the inner product of feature vector i with feature
  vector j; the notebook uses it as a similarity ("correlation") matrix.
  """
  return np.inner(features, features)

In [0]:
def sentence_embedding(sentences_):
  """Embed a batch of sentences by delegating to embed()."""
  return embed(sentences_)

In [0]:
def evaluate_answer(answers, applicant_answer):
  """Score an applicant answer against a list of reference answers.

  The reference answers and the applicant answer are embedded together,
  their pairwise similarity matrix is computed, and the highest similarity
  between the applicant answer and any reference answer is returned.

  Args:
    answers: list of reference sentences. It is NOT modified.
    applicant_answer: the sentence to score.

  Returns:
    The best similarity score, as returned by get_best_score().
  """
  # Build a fresh list instead of appending to `answers`. Appending mutated
  # the caller's list, so every later call also compared against previously
  # scored applicant answers and the reference data kept growing.
  sentences = list(answers) + [applicant_answer]

  sentence_embeddings_ = sentence_embedding(sentences)

  corr = calc_corr(sentence_embeddings_)
  return get_best_score(corr)

In [0]:
def evaluate_text(descriptions_embed, in_desc_embed):
  """Append `in_desc_embed` to `descriptions_embed` in place and return None.

  NOTE(review): the similarity step (calc_corr / get_best_score) is
  disabled in this function, so no score is computed or returned yet.
  """
  descriptions_embed.append(in_desc_embed)
  return None

In [0]:
def get_best_score(corr):
  """Return the best similarity between the last text and all the others.

  Args:
    corr: similarity matrix between all the texts; the last row belongs to
      the applicant's text.

  Returns:
    The maximum of the last row after dropping its final entry (the
    text's similarity with itself).
  """
  last_row = corr[len(corr) - 1]
  scores_vs_others = np.delete(last_row, -1)
  return max(scores_vs_others)

## Clean All data

In [0]:
def clean_descriptions(descriptions):
  """Remove stop words and punctuation from job descriptions.

  Args:
    descriptions: either a pandas DataFrame with a 'Job Description'
      column, or a single description string.

  Returns:
    For a DataFrame, a Series holding the cleaned 'Job Description' column;
    for a string, the cleaned string. For any other type a message is
    printed and None is returned.
  """
  if isinstance(descriptions, pd.DataFrame):
    # Read the column from the frame that was passed in. The previous code
    # used the notebook-level `data` global and ignored the argument.
    cleaned = rem_stop_words(descriptions['Job Description'])  # Remove stop-words
    return rem_punc(cleaned)  # Remove the punctuation
  if isinstance(descriptions, str):
    cleaned = rem_stop_words_from_text(descriptions)
    return rem_punc_from_text(cleaned)
  print("Check Datatype can't be: ", type(descriptions))
  return None

## Make N-grams

In [0]:
# TextBlob provides the n-gram extraction used by ngrams() below.
from textblob import TextBlob
# presumably the tokenizer data TextBlob needs to split text into words (TODO confirm)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
def ngrams(text, n):
  """Return the n-grams of `text` as a list of space-joined strings.

  Args:
    text: the sentence to split.
    n: number of words per n-gram.

  Returns:
    One string per n-gram produced by TextBlob(text).ngrams(n), in order.
  """
  blob = TextBlob(text)
  return [' '.join(gram) for gram in blob.ngrams(n)]

In [0]:
def average(lst):
  """Return the arithmetic mean of a non-empty sequence of numbers.

  Raises ZeroDivisionError when `lst` is empty.
  """
  total = sum(lst)
  count = len(lst)
  return total / count

In [0]:
from random import randint
def calc_n_dynamic_ngram(descriptions, in_description):
  """Pick a random n-gram size from word-count differences.

  Five descriptions are sampled at random (with replacement). The
  word-count difference between each sample and `in_description` is
  averaged, and the absolute value of that average, truncated to an int,
  becomes the upper bound for the random n.

  Args:
    descriptions: non-empty sequence of description strings.
    in_description: the description being classified.

  Returns:
    A random integer n with 2 <= n <= max(2, avg).

  Raises:
    ValueError: if `descriptions` is empty.
  """
  length = len(descriptions)
  if length == 0:
    # randint(0, -1) would raise ValueError anyway; fail with a clear message.
    raise ValueError("descriptions must not be empty")

  diff = [
    compare_strings_lengths(descriptions[randint(0, length - 1)], in_description)
    for _ in range(5)
  ]
  avg = int(abs(average(diff)))

  # randint(2, avg) raises ValueError whenever avg < 2 (texts of similar
  # length), so clamp the upper bound to the minimum n-gram size.
  return randint(2, max(2, avg))

In [0]:
def dynamic_ngram(descriptions, in_description, n=3):
  """Split every description and the input description into n-grams.

  Args:
    descriptions: iterable of description strings.
    in_description: the description being classified.
    n: number of words per n-gram. Defaults to 3, the value that used to be
      hard-coded; pass the result of calc_n_dynamic_ngram(...) to choose it
      dynamically.

  Returns:
    A tuple (descriptions_grams, in_description_ngram): a list holding one
    n-gram list per description, and the n-gram list of the input.
  """
  descriptions_grams = [ngrams(description, n) for description in descriptions]
  in_description_ngram = ngrams(in_description, n)

  return descriptions_grams, in_description_ngram

In [0]:
def classify(all_ngrams, job_titles):
  """Find the description whose n-grams best match the input's n-grams.

  Each input n-gram is scored against a description's n-grams with
  evaluate_answer(); the description's score is the average over all input
  n-grams. The description with the highest score wins (the first one on
  ties).

  Args:
    all_ngrams: pair (per-description n-gram lists, input n-gram list), as
      returned by dynamic_ngram().
    job_titles: job titles aligned by position with the descriptions
      (pandas Series or any indexable sequence).

  Returns:
    A tuple (index of the best description, its job title).

  Raises:
    ValueError: if there are no descriptions to compare against.
  """
  description_grams = all_ngrams[0]
  in_description_grams = all_ngrams[1]

  evaluations = []
  for description_ngram in description_grams:
    if not description_ngram:
      # A description too short to yield any n-gram cannot be scored
      # (the scorer would call max() on an empty row); count it as 0.
      evaluations.append(0)
      continue
    scores = [
      # Pass a copy so the reference n-grams are never altered by scoring,
      # which keeps each score independent and classify() re-runnable.
      evaluate_answer(list(description_ngram), in_description_ngram)
      for in_description_ngram in in_description_grams
    ]
    evaluations.append(average(scores) if scores else 0)

  if not evaluations:
    raise ValueError("classify() needs at least one description to compare against")

  ind = evaluations.index(max(evaluations))
  # `ind` is a position, so index a pandas object positionally; plain
  # label lookup breaks when the index is not 0..n-1.
  title = job_titles.iloc[ind] if hasattr(job_titles, 'iloc') else job_titles[ind]
  return ind, title

In [0]:
def save_model(model, filename='/content/drive/My Drive/Colab Notebooks/occupations_dataset/finalized_model.sav'):
  """Pickle `model` to `filename`.

  Args:
    model: any picklable object.
    filename: destination path; defaults to the notebook's Drive location.
  """
  import pickle
  # The context manager guarantees the file is flushed and closed even if
  # pickling fails; the previous code never closed the handle.
  with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [0]:
def load_model(filename='/content/drive/My Drive/Colab Notebooks/occupations_dataset/finalized_model.sav'):
  """Load and return the object pickled at `filename`.

  Args:
    filename: path of the pickle file; defaults to the notebook's Drive
      location.

  Returns:
    The unpickled object.
  """
  import pickle
  # NOTE: unpickling can execute arbitrary code. Only load files that you
  # created yourself (e.g. with save_model).
  with open(filename, 'rb') as model_file:
    return pickle.load(model_file)

In [0]:
# Placeholder; replaced by the cleaned dataset descriptions in the next code cell.
descriptions = []
# Free-text job description to classify, read interactively from the user.
in_description = input("Enter the description: ")

# print(evaluate_answer(descriptions, in_description))

Enter the description: powerful, open source object-relational database system with over 30 years of active development that has earned


In [0]:
# Location of the occupations CSV on the mounted Google Drive.
occup_file = "/content/drive/My Drive/Colab Notebooks/occupations_dataset/npo_occupations.csv"
data = read_dataset(occup_file)
# Clean the dataset descriptions (DataFrame branch) and the user's text (str branch).
descriptions = clean_descriptions(data)
in_description = clean_descriptions(in_description)
# descriptions = data.iloc[:,3]
# Convert the cleaned Series into a plain Python list for n-gram extraction.
descriptions = descriptions.values.tolist()

In [0]:
# Pair of (n-gram lists per dataset description, n-gram list of the input description).
all_ngrams = dynamic_ngram(descriptions, in_description)

In [0]:
# all_ngrams = load_model()

In [0]:
# Displays (index of the best-matching description, its job title).
classify(all_ngrams, data['Job Title'])

(1444, 'Database Architects ')

In [0]:
def main():
  """Classify the notebook's current input description.

  Relies on the notebook-level globals `descriptions` (list of cleaned
  description strings), `in_description` (cleaned input string) and `data`
  (DataFrame with a 'Job Title' column) having been created by earlier
  cells; a NameError is raised if any of them is missing.

  Returns:
    The (index, job title) tuple produced by classify().
  """
  all_ngrams = dynamic_ngram(descriptions, in_description)
  predicted_job = classify(all_ngrams, data['Job Title'])
  # Return the prediction; it used to be computed and silently discarded.
  return predicted_job

In [0]:
# Run the classification when this code is executed as the top-level module (e.g. a notebook cell).
if __name__ == "__main__":
  main()

NameError: ignored

In [0]:
# n-gram size for the embedding experiment in the following cells.
n = 3
# One list of n-gram strings per dataset description.
desc_ngrams = [ngrams(desc, n) for desc in descriptions]

In [0]:
# Embed each description's n-gram list as its own batch.
desc_embeddings = [sentence_embedding(desc_ngram) for desc_ngram in desc_ngrams]
# Embed the whole input description as a single-sentence batch.
in_desc_embed = sentence_embedding([in_description])

In [0]:
# Appends the input embedding to desc_embeddings in place; evaluate_text returns None.
# NOTE(review): the saved output for this cell shows a ValueError. Confirm which revision of evaluate_text produced it.
evaluate_text(desc_embeddings, in_desc_embed[0])

ValueError: ignored

In [0]:
# Inspect the embedding vector of the input description.
in_desc_embed[0]

<tf.Tensor: shape=(512,), dtype=float32, numpy=
array([ 5.97083010e-02,  1.66217722e-02, -4.53656502e-02, -4.11200412e-02,
        5.22415154e-02,  1.62055120e-02,  4.47978824e-03,  1.09153399e-02,
        5.69913760e-02, -7.25785941e-02, -2.15205383e-02, -5.06714359e-03,
       -3.72533090e-02,  2.54637655e-02, -5.27620763e-02, -2.63341647e-02,
       -3.50272134e-02,  3.45307924e-02,  1.73696037e-02,  1.63036864e-02,
       -5.74379861e-02,  3.06069795e-02, -1.23732984e-02,  2.39667371e-02,
       -6.54200837e-02,  1.47247836e-02, -6.23083524e-02, -2.43534241e-02,
       -1.39863193e-02,  1.04058243e-01,  4.96279597e-02, -2.92044673e-02,
        4.11120653e-02,  7.14798868e-02, -5.92965148e-02,  1.37531413e-02,
        8.54870602e-02,  3.41858156e-02, -2.39310018e-03, -5.83407516e-03,
       -7.51781464e-02,  3.42067406e-02,  6.79537728e-02, -9.30743366e-02,
       -3.93298902e-02,  1.97378341e-02,  2.84429826e-02, -1.93067491e-02,
       -5.70718832e-02,  5.29325269e-02,  3.16250660