# Introduction


This project consists of finding a correlation between job descriptions and skills.

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import nltk
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# from wordcloud import WordCloud

# import matplotlib.pyplot as plt
# %matplotlib inline
# from textblob import Word

# Read data

Let's start by reading this data.

In [None]:
from google.colab import drive

drive.mount('/gdrive')
%cd /gdrive/My\ Drive/Colab\ Notebooks/extract\ skills\ from\ job

In [7]:
test = pd.read_csv('jobs-title-and-description.csv')
## Delete empty rows (In case I missed parsing a row)
test = test.dropna()
print("\n ** raw data **\n")
print(test.head())
print("\n ** data shape **\n")
print(test.shape)


 ** raw data **

                                           job_title  \
0                      Chief Marketing Officer (CMO)   
1                                   Registered Nurse   
2                                   Dental Hygienist   
3                        Senior Salesforce Developer   
4  DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...   

                                         description  
0  Who We're Looking For:\n\nThe Chief Marketing ...  
1  Queens Boulevard Endoscopy Center, an endoscop...  
2  Part-time or Full-timedental hygienist positio...  
3  Principle Duties & Responsibilities:\n\nAnalyz...  
4  For FULL Job Announcement, visit our website: ...  

 ** data shape **

(900, 2)


This data contains job descriptions and is structured into two columns: 

* job_title : for the job title.
* description : raw text describing the job requirements.

Let's now check if our data is balanced and therefore eligible to modeling.

There are approximatively 30 rows for each job.

# Preprocess text data
Since the data we're now working with is at its rawest form, we need to preprocess it before extracting information from it.

In [8]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 

[nltk_data] Downloading package stopwords to /home/zubair/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/zubair/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/zubair/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/zubair/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
## Lower case
test['description'] = test['description'].apply(lambda x: " ".join(x.lower()for x in x.split()))
## remove tabulation and punctuation
test['description'] = test['description'].str.replace('[^\w\s]',' ')
## digits
test['description'] = test['description'].str.replace('\d+', '')

#remove stop words
stop = stopwords.words('english')
test['description'] = test['description'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

## lemmatization
test['description'] = test['description'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

print("Preprocessed data: \n")
print(test.head())

  test['description'] = test['description'].str.replace('[^\w\s]',' ')
  test['description'] = test['description'].str.replace('\d+', '')


NameError: name 'Word' is not defined

# Visualize data
In this step, **we will aggregate our data by job titles** in order to visualy detect the most frequent words for each job.

In [10]:
## jda stands for job description aggregated
jda = test.groupby(['job_title']).sum().reset_index()
print("Aggregated job descriptions: \n")
print(jda)

Aggregated job descriptions: 

                                            job_title  \
0                                       ABA Therapist   
1                       Chief Marketing Officer (CMO)   
2                        Construction Project Manager   
3   DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...   
4                                    Dental Hygienist   
5                                     Diesel Mechanic   
6                       Doctor of Veterinary Medicine   
7                        Emergency Veterinarian - NYC   
8                     Emergency Veterinary Technician   
9                        Experienced A level mechanic   
10                          Forward Deployed Engineer   
11                                       Hair Stylist   
12                          Interested in KWT Global?   
13                         Lab - Medical Technologist   
14                  Lead Pharmacy Technician: Billing   
15                       Mammography Technologist PRN   


In [None]:
# ## Visualize data
# jobs_list = jda.job_title.unique().tolist()
# for job in jobs_list:

#     # Start with one review:
#     text = jda[jda.job_title == job].iloc[0].description
#     # Create and generate a word cloud image:
#     wordcloud = WordCloud().generate(text)
#     print("\n***",job,"***\n")
#     # Display the generated image:
#     plt.imshow(wordcloud, interpolation='bilinear')
#     plt.axis("off")
#     plt.show()

The presence of meaningless words such as: Technology, Organization, Company.
as well as the presence of the job title itself will be safely deleted from our data.

In [None]:
## Delete more stop words
other_stop_words = ['junior', 'senior','experience','etc','job','work','company','technique',
                    'candidate','skill','skills','language','menu','inc','new','plus','years',
                   'technology','organization','ceo','cto','account','manager','data','scientist','mobile',
                    'developer','product','revenue','strong']

test['description'] = test['description'].apply(lambda x: " ".join(x for x in x.split() if x not in other_stop_words))

# Modeling
We are now going to translate this skill-extraction problem into a classification one first.
And then extract the most important features from each class.

The most important features, in this case, represent the words that most likely will belong to a class ( in our case job title) 


 naive bayes algorithm selected for this training.

In [12]:
## Converting text to features 
vectorizer = TfidfVectorizer()
#Tokenize and build vocabulary
X = vectorizer.fit_transform(test.description)
y = test.job_title

# split data into 80% training and 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=109) 
print("train data shape: ",X_train.shape)
print("test data shape: ",X_test.shape)

# Fit model
clf = MultinomialNB()
clf.fit(X_train, y_train)
## Predict
y_predicted = clf.predict(X_test)

train data shape:  (720, 2112)
test data shape:  (180, 2112)


## MODEL EVALUATION 

In [13]:
#evaluate the predictions
print("Accuracy score is: ",accuracy_score(y_test, y_predicted))
print("Classes: (to help read Confusion Matrix)\n", clf.classes_)
print("Confusion Matrix: ")

print(confusion_matrix(y_test, y_predicted))
print("Classification Report: ")
print(classification_report(y_test, y_predicted))

Accuracy score is:  0.9555555555555556
Classes: (to help read Confusion Matrix)
 ['ABA Therapist' 'Chief Marketing Officer (CMO)'
 'Construction Project Manager'
 'DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL ADVOCACY'
 'Dental Hygienist' 'Diesel Mechanic' 'Doctor of Veterinary Medicine'
 'Emergency Veterinarian - NYC' 'Emergency Veterinary Technician'
 'Experienced A level mechanic' 'Forward Deployed Engineer' 'Hair Stylist'
 'Interested in KWT Global?' 'Lab - Medical Technologist'
 'Lead Pharmacy Technician: Billing' 'Mammography Technologist PRN'
 'NYS Licensed Psychologist' 'OT/ICS Systems Engineer'
 'Paid Search Director' 'Pest Control Technician' 'Plumber'
 'Principal Incident Response Consultant'
 'Principal, Sr. Consultant – Creative Technologist' 'RN / LPN'
 'RN/LPN (PRN)'
 'Regional Vice President – Partner Development (East Coast)'
 'Registered Nurse' 'Senior Estimator/Project Manager'
 'Senior Salesforce Developer' 'Ultrasound Technologist']
Confusion Matrix: 
[[ 5  0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Feature extraction
Let's now extract the most meaningful features of each class.

To do so, we can access the attribute *feature_log_prob_* from our model which returns the log probability of features given a class.

We will next sort the log probabilies descendingly.

And finally map the most important tokens to the classes


# Output
At this step, we have for each class/job a list of the most representative words/tokens found in job descriptions.

Let's shrink this list of words to only:
* 6 technical skills
* 6 adjectives

To do so, we use the library *TextBlob* to identify adjectives.

Also, given a (non-exhaustive) list of programming languages, we can extract the top technical skills.


In [None]:
from textblob import TextBlob
technical_skills = ['python', 'c','r', 'c++','java','hadoop','scala','flask','pandas','spark','scikit-learn',
                    'numpy','php','sql','mysql','css','mongdb','nltk','fastai' , 'keras', 'pytorch','tensorflow',
                   'linux','Ruby','JavaScript','django','react','reactjs','ai','ui','tableau']
feature_array = vectorizer.get_feature_names()
# number of overall model features
features_numbers = len(feature_array)
## max sorted features number
n_max = int(features_numbers * 0.1)


##initialize output dataframe
output = pd.DataFrame()
for i in range(0,len(clf.classes_)):
    print("\n****" ,clf.classes_[i],"****\n")
    class_prob_indices_sorted = clf.feature_log_prob_[i, :].argsort()[::-1]
    raw_skills = np.take(feature_array, class_prob_indices_sorted[:n_max])
    print("list of unprocessed skills :")
    print(raw_skills)
    
    ## Extract technical skills
    top_technical_skills= list(set(technical_skills).intersection(raw_skills))[:6]
    #print("Top technical skills",top_technical_skills)
    
    ## Extract adjectives
    
    # Delete technical skills from raw skills list
    ## At this steps, raw skills list doesnt contain the technical skills
    #raw_skills = [x for x in raw_skills if x not in top_technical_skills]
    #raw_skills = list(set(raw_skills) - set(top_technical_skills))

    # transform list to string
    txt = " ".join(raw_skills)
    blob = TextBlob(txt)
    #top 6 adjective
    top_adjectives = [w for (w, pos) in TextBlob(txt).pos_tags if pos.startswith("JJ")][:6]
    #print("Top 6 adjectives: ",top_adjectives)
    
    output = output.append({'job_title':clf.classes_[i],
                        'technical_skills':top_technical_skills,
                        'soft_skills':top_adjectives },
                       ignore_index=True)

Correlation between jobs and skills:

In [None]:
print(output.T)