In [1]:
import numpy as np
import pandas as pd
import gensim
import gensim.corpora 
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy 
from nltk.corpus import stopwords
import pyLDAvis
import pyLDAvis.gensim_models
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)

In [2]:
# Load the job descriptions as a Series. Only one column is used downstream,
# so `usecols` tells the parser to skip every other column in the CSV.
df = pd.read_csv(
    "Only_Description_Sentences.csv",
    usecols=["EditedJobDescription"],
)["EditedJobDescription"]
df

0        {"description": ["Selected candidates work wid...
1        {"description": ["core organization supports p...
2        {"description": ["As Senior Process Controls E...
3        {"description": ["Performs trade studies model...
4        {"description": ["This position must meet Expo...
                               ...                        
18530    {"description": ["Dynetics wholly owned subsid...
18531    {"description": ["Siemens Healthineers continu...
18532    {"description": ["The EngineerOperator respons...
18533    {"description": ["Pyramid Global Hospitality o...
18534    {"description": ["Must bring least 2 years equ...
Name: EditedJobDescription, Length: 18535, dtype: object

In [3]:
# Peek at the first 90 characters of the first raw description.
print(df.iloc[0][:90])

{"description": ["Selected candidates work wide range programs provide state art Guidance 


In [4]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Args:
        texts: Iterable of documents. Non-string items are converted with
            ``str()``; missing values (``None`` or a float NaN) become an
            empty document rather than the literal words "None" / "nan".
        allowed_postags: Coarse part-of-speech tags (compared against
            ``token.pos_``) to keep. The default list is only read, never
            mutated.

    Returns:
        A list with one space-joined string of lemmas per input document,
        in input order (so it stays aligned with ``texts``).
    """
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    def _as_text(text):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        return text if isinstance(text, str) else str(text)

    texts_out = []
    # nlp.pipe streams the documents through the pipeline in batches, which
    # is much cheaper than calling nlp() once per document on a large corpus.
    for doc in nlp.pipe(_as_text(text) for text in texts):
        kept_lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        texts_out.append(" ".join(kept_lemmas))
    return texts_out

# Lemmatize every job description, then preview the first 90 characters
# of the first lemmatized document.
lemmatized_texts = lemmatization(texts=df)
print(lemmatized_texts[0][0:90])

description selected candidate work wide range program provide state art performance analy


In [5]:
# Domain-specific filler words removed from every document after tokenization.
custom_stop_words = [
    "description",
    "responsibilities",
    "responsible",
    "responsibility",
    "requirements",
    "required",
    "require",
    "requirement",
    "duties",
    "to",
    "the",
    "engineering",
    "engineer",
    "engineers",
    "job",
    "perform",
    "performs",
    "position",
    "may",
    "team",
    "work",
    "include",
    "including",
    "assigned",
    "experience",
]

def gen_words(texts, stop_words=None):
    """Tokenize each document and remove stop words.

    Args:
        texts: Iterable of document strings.
        stop_words: Optional iterable of tokens to drop. When omitted, the
            notebook-level ``custom_stop_words`` list is used.

    Returns:
        A list with one list of tokens per input document, in input order.
    """
    # Build the lookup once: a set makes each per-token membership test O(1)
    # instead of scanning the whole stop-word list for every token.
    excluded = frozenset(custom_stop_words if stop_words is None else stop_words)

    final = []
    for text in texts:
        # deacc=True asks gensim to strip accent marks while tokenizing.
        tokens = gensim.utils.simple_preprocess(text, deacc=True)
        final.append([word for word in tokens if word not in excluded])
    return final

# Tokenize the lemmatized descriptions and strip the custom stop words.
data_words = gen_words(texts=lemmatized_texts)

# Preview the first 20 tokens of the first document.
print(data_words[0][0:20])

['selected', 'candidate', 'wide', 'range', 'program', 'provide', 'state', 'art', 'performance', 'analysis', 'simulation', 'solution', 'customer', 'mission', 'trajectory', 'ensure', 'traceability', 'quality', 'system', 'level']


In [6]:
# BIGRAMS AND TRIGRAMS
# Learn which adjacent tokens co-occur often enough to be merged into one
# phrase token. The trigram model is trained on the bigram-merged corpus, so
# it can join an existing bigram with a neighbouring word.
bigram_phrases = gensim.models.Phrases(
    sentences=data_words,
    min_count=5,
    threshold=100,
)
trigram_phrases = gensim.models.Phrases(
    sentences=bigram_phrases[data_words],
    threshold=100,
)

# Wrap the trained models in Phraser objects, which are what the notebook
# applies to documents below.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the notebook-level ``bigram`` model to every tokenized document.

    Returns a new list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram`` and then ``trigram`` to every tokenized document.

    The bigram model is applied inside this function, so callers pass the
    same kind of token lists they would pass to ``make_bigrams``.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram[tokens]
        merged_docs.append(trigram[with_bigrams])
    return merged_docs

data_bigrams = make_bigrams(data_words)
# make_trigrams() applies the bigram model itself (trigram[bigram[doc]]), so it
# must receive the raw token lists. Passing data_bigrams would run the bigram
# model a second time over already-merged tokens, which does not match how
# trigram_phrases was trained (on a single bigram pass over data_words).
data_bigrams_trigrams = make_trigrams(data_words)

# Preview the first 20 tokens of the first document after phrase merging.
print(data_bigrams_trigrams[0][0:20])

['selected', 'candidate', 'wide_range', 'program', 'provide', 'state_art', 'performance', 'analysis', 'simulation', 'solution', 'customer', 'mission', 'trajectory', 'ensure', 'traceability', 'quality', 'system', 'level', 'component', 'level']


In [7]:
# TF-IDF REMOVAL
# Drop, per document, the tokens that carry little information: those whose
# TF-IDF weight is below `low_value`, and those the TF-IDF model omits
# entirely (their weight is zero).
from gensim.models import TfidfModel

id2word = gensim.corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words = []  # every token dropped from any document (kept for inspection)
words_missing_in_tfidf = []
for i, bow in enumerate(corpus):
    # Score the document once and reuse the result.
    doc_tfidf = tfidf[bow]
    tfidf_ids = {token_id for token_id, _ in doc_tfidf}

    low_value_words = [token_id for token_id, score in doc_tfidf if score < low_value]
    # Must be computed for THIS document before building `drops`; previously
    # `drops` picked up the missing ids left over from the previous document.
    words_missing_in_tfidf = [token_id for token_id, _ in bow if token_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[token_id] for token_id in drops)

    # A set gives O(1) membership tests while filtering the bag-of-words.
    drop_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in drop_ids]

In [9]:
# id2word = gensim.corpora.Dictionary(data_words)

# corpus = []
# for text in data_words:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

# print (corpus[0][0:20])

# word = id2word[[0][:1][0]]
# print (word)

In [21]:
# Train an 8-topic LDA model on the TF-IDF-filtered bag-of-words corpus.
# random_state is fixed so repeated runs use the same seed.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=8,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)

In [11]:
# test_doc = corpus[-1]

# vector = lda_model[test_doc]
# print (vector)

# def Sort(sub_li):
#     sub_li.sort(key = lambda x: x[1])
#     sub_li.reverse()
#     return (sub_li)
# new_vector = Sort(vector)
# print (new_vector)

In [22]:
# Build the interactive pyLDAvis view of the trained topics and display it
# inline as the cell's output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
)
vis

