In [69]:
import re
import nltk
import pandas as pd
from tqdm import tqdm
from icecream import ic

In [70]:
def _phrase_to_text(phrase):
    """Return the raw text of one entry, flattening row-like entries."""
    if isinstance(phrase, str):
        return phrase
    # Rows (lists, tuples, 1-D+ arrays such as a row of DataFrame.to_numpy())
    # are joined cell by cell. Calling str() on the row itself would go through
    # repr(), which renders an embedded tab/newline as a literal backslash
    # escape; once punctuation is stripped the escape letter is glued into the
    # neighbouring token instead of separating two tokens.
    if isinstance(phrase, (list, tuple)) or getattr(phrase, 'ndim', 0) >= 1:
        return " ".join(str(cell) for cell in phrase)
    return str(phrase)


def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """
    Clean and tokenize every phrase in an iterable of phrases.

    Each entry is lower-cased, stripped of punctuation, split on whitespace,
    optionally filtered against a stop-word list, then optionally stemmed
    and/or lemmatized (stemming runs first when both flags are set).
    :parameter
        :param text: iterable - phrases to clean; each entry is either a
            string or a row (list / tuple / array) of cells. Any other entry
            is converted with str()
        :param flg_stemm: bool - whether stemming is to be applied
        :param flg_lemm: bool - whether lemmatization is to be applied
        :param lst_stopwords: list - list of stopwords to remove
    :return
        list of token lists, one per input entry and in the same order
    """
    # Loop-invariant helpers are built once instead of once per phrase.
    # A set gives constant-time stop-word lookups.
    stopword_set = frozenset(lst_stopwords) if lst_stopwords is not None else None
    stemmer = nltk.stem.porter.PorterStemmer() if flg_stemm else None
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer() if flg_lemm else None

    phrases = []
    for phrase in tqdm(text):
        ## clean (convert to lowercase, strip, and remove every character
        # that is neither a word character nor whitespace)
        cleaned = re.sub(r'[^\w\s]', '', _phrase_to_text(phrase).lower().strip())

        ## Tokenize (convert from string to list)
        lst_text = cleaned.split()

        ## remove Stopwords
        if stopword_set is not None:
            lst_text = [word for word in lst_text if word not in stopword_set]

        ## Stemming (remove -ing, -ly, ...)
        if stemmer is not None:
            lst_text = [stemmer.stem(word) for word in lst_text]

        ## Lemmatization (convert the word into root word)
        if lemmatizer is not None:
            lst_text = [lemmatizer.lemmatize(word) for word in lst_text]

        phrases.append(lst_text)
    return phrases

In [85]:
# Make sure the corpora used by the stop-word filter and the WordNet
# lemmatizer are available, downloading only the ones that are missing.
for corpus in ('stopwords', 'wordnet', 'omw-1.4'):
    # A corpus may be installed either as an unpacked directory or only as a
    # zip archive. Looking up the bare directory name raises LookupError in the
    # zip-only case, which would trigger a needless download on every run, so
    # both forms are probed before falling back to nltk.download().
    for resource in (f'corpora/{corpus}', f'corpora/{corpus}.zip'):
        try:
            nltk.data.find(resource)
            break
        except LookupError:
            continue
    else:
        nltk.download(corpus)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zackj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\zackj\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [72]:
from nltk.corpus import stopwords

# English stop words that are dropped from every phrase during preprocessing.
stop_words = stopwords.words('english')

# Reference skills and raw candidate skills, each loaded as a 2D array.
# NOTE(review): header=1 uses the file's second line as the header row, so the
# first line is never read as data -- confirm this matches the CSV layout.
good_list = pd.read_csv('data/Example_Technical_Skills.csv', header=1).to_numpy()
raw_list = pd.read_csv('data/Raw_Skills_Dataset.csv', header=1).to_numpy()

# flg_lemm keeps its default (True), so tokens are stemmed and then lemmatized.
good_list_p = preprocess_text(good_list, flg_stemm=True, lst_stopwords=stop_words)
raw_list_p = preprocess_text(raw_list, flg_stemm=True, lst_stopwords=stop_words)

100%|██████████| 978/978 [00:00<00:00, 10302.62it/s]
100%|██████████| 34115/34115 [00:02<00:00, 15893.58it/s]


In [86]:
# Sanity check: preprocessing must yield one token list per raw row, because
# indices computed on raw_list_p are used later to index raw_list.
ic(len(raw_list_p), len(raw_list))

ic| len(raw_list_p): 34115, len(raw_list): 34115


(34115, 34115)

In [73]:
# Spot-check the first entry of each list before and after preprocessing.
ic(good_list[0], good_list_p[0])
ic(raw_list[0], raw_list_p[0])

ic| good_list[0]: array(['Oracle Instance Management & Strategy'], dtype=object)
    good_list_p[0]: ['oracl', 'instanc', 'manag', 'strategi']
ic| raw_list[0]: array(['seniority'], dtype=object)
    raw_list_p[0]: ['senior']


(array(['seniority'], dtype=object), ['senior'])

In [74]:
def clean_data(data, text_filter):
    """
    Return the positions of the phrases in ``data`` that share at least one
    token with any phrase in ``text_filter``.

    Matching is exact token equality (no fuzzy or substring matching), so both
    arguments should have gone through the same preprocessing.
    :parameter
        :param data: 2D List of preprocessed word phrases to be filtered
        :param text_filter: 2D List of acceptable rooted word phrases (each
            word in a phrase is a root); tokens must be hashable
    :return:
        1D List of indices into ``data`` that passed the filter, in ascending
        order, each index appearing at most once
    """
    # Flatten the filter into one set up front so every token costs a single
    # hash lookup instead of a scan over every filter phrase.
    accepted_tokens = {token for phrase in text_filter for token in phrase}

    indices = []
    for i, words in enumerate(tqdm(data)):
        # any() stops at the first matching token, like the original early exit.
        if any(word in accepted_tokens for word in words):
            indices.append(i)
    return indices

# Row positions in raw_list_p whose tokens overlap with the reference skills.
tech_skill_indices = clean_data(raw_list_p, good_list_p)

100%|██████████| 34115/34115 [00:03<00:00, 10728.01it/s]


In [75]:
# Report how many raw rows passed the filter.
print(len(tech_skill_indices))
# Select the original (un-preprocessed) rows at the surviving positions.
cleaned_list = raw_list[tech_skill_indices]

20110


In [84]:
# Persist the surviving rows as a CSV with a single named column.
# NOTE(review): assumes cleaned_list has exactly one column -- confirm.
CSD_df = pd.DataFrame(cleaned_list, columns=['Extracted Skills'])
# index=False keeps the DataFrame's positional index out of the file.
CSD_df.to_csv('data/Cleaned_Skills_Dataset.csv', index=False)