In [6]:
import pandas as pd

In [7]:
# Load the raw survey export into a DataFrame; later cells read it as `df`.
survey_csv_path = 'baseline_taxonomies/survey_results_public.csv'
df = pd.read_csv(survey_csv_path)

In [8]:
import re
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Ensure the NLTK resources used in this notebook are available locally.
# nltk.download() skips packages that are already up to date.
nltk.download("punkt")  # tokenizer models for nltk.word_tokenize
nltk.download("wordnet")  # lexical database behind WordNetLemmatizer
# nltk.pos_tag (called from get_wordnet_pos) needs the perceptron tagger model;
# without it the first lemmatizing call raises LookupError on a fresh machine.
nltk.download("averaged_perceptron_tagger")
# NOTE(review): newer NLTK releases appear to look up "punkt_tab" and
# "averaged_perceptron_tagger_eng" instead -- confirm against the installed
# version and add those package names if a LookupError asks for them.

# Single shared lemmatizer instance, reused by process_skill
lemmatizer = WordNetLemmatizer()


# Function to get part of speech for accurate lemmatization
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the WordNet category: J -> adjective,
    N -> noun, V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    # tagged is a one-element list of (word, tag); only the tag's initial matters
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag both resolve to noun
    return wordnet.NOUN


def process_skill(s, lemmatize=True):
    """Normalize a raw skill string so that variants compare equal.

    Steps:
      1. Lower-case the text.
      2. Drop every character that is not an ASCII letter, digit or
         whitespace (so punctuation such as ``+``, ``#`` or ``.`` is removed).
      3. Collapse runs of whitespace to a single space and trim the ends.
      4. Optionally tokenize with NLTK and replace each token by its
         WordNet lemma, using the POS returned by ``get_wordnet_pos``.

    Args:
        s: Raw skill text (a ``str``).
        lemmatize: When True (default) apply step 4; when False return the
            cleaned text from step 3 unchanged.

    Returns:
        The normalized skill string; may be empty if nothing survives cleaning.
    """
    s = s.lower()
    s = re.sub(r"[^A-Za-z0-9\s]", "", s)  # Keep only alphanumerics and whitespace

    # Collapse internal whitespace as well as trimming the ends, so the result
    # is the same shape whether or not lemmatization runs afterwards.
    s = " ".join(s.split())

    if lemmatize:
        tokens = nltk.word_tokenize(s)
        s = " ".join(
            lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens
        )

    return s

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leanh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\leanh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Read the list of survey column names to harvest, one name per line.
with open('baseline_taxonomies/take_columns.txt', 'r') as f:
    # Strip newlines/padding and skip blank lines: an empty column name would
    # raise KeyError as soon as it is used to index the survey DataFrame.
    take_columns = [name for name in (line.strip() for line in f) if name]

In [None]:
# Collect every distinct skill named in the selected survey columns.
# Each cell holds zero or more skills separated by ';'.
skills_set = set()
for col in take_columns:
    # pd.read_csv parses the literal text "NA" as NaN by default, so comparing
    # against the string 'NA' never matches and str(NaN) would add a bogus
    # 'nan' skill. Drop missing values explicitly instead.
    answers = df[col].dropna().astype(str)
    for answer in answers:
        # Still honour a literal 'NA' in case the CSV was loaded with
        # keep_default_na=False.
        if answer == 'NA':
            continue
        for skill in answer.split(';'):
            skill = skill.strip()
            if skill:  # ignore empty fragments such as a trailing ';'
                skills_set.add(skill)

In [None]:
# Merge the LinkedIn taxonomy into skills_set: every comma-separated entry of
# the "Skills" column plus the "Field" (role) name itself.
lidf = pd.read_csv("baseline_taxonomies/linkedinskills.csv")
for role, skills in zip(lidf["Field"], lidf["Skills"]):
    # A missing cell is NaN (a float), which has no .split(); skip it.
    if pd.notna(skills):
        for skill in str(skills).split(","):
            # Strip the padding left by ", " separators so that "Python" and
            # " Python" do not end up as two different entries.
            skill = skill.strip()
            if skill:
                skills_set.add(skill)
    # Keep non-string values (NaN) out of the set; the export step sorts the
    # items and lower-cases them, which requires every item to be a str.
    if pd.notna(role):
        role = str(role).strip()
        if role:
            skills_set.add(role)

In [12]:
# Write the collected skills, one lower-cased entry per line.
# Lower-case BEFORE de-duplicating and sorting: otherwise "Python" and "python"
# are written as two identical lines and the order reflects the original case.
normalized_skills = sorted({item.lower() for item in skills_set})
with open("baseline_taxonomies/skills.txt", "w") as f:
    for item in normalized_skills:
        f.write("%s\n" % item)