In [143]:
import csv
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

In [144]:
system_message = """
You are a tagging assistant categorizing skills based on a predefined taxonomy for use in a technical skill tagging system. Each technical skill given should be evaluated against 18 skill categories, indicating its relevance to each category. Expect the given technical skill to fall into one of the following categories:

1. **Programming Languages**: Languages used for coding and scripting, such as C, C++, Python, etc.
2. **Frameworks and Libraries**: Pre-written collections of code used as a foundation for applications.
3. **Development Tools and IDEs**: Software and environments for coding, debugging, and managing code.
4. **Cloud Platforms and Services**: Platforms offering cloud computing and storage capabilities.
5. **DevOps Tools**: Tools for continuous integration, deployment, and automation.
6. **Network and Security Concepts**: Knowledge and tools associated with computer networking and cybersecurity.
7. **Machine Learning and AI Concepts**: Algorithms, techniques, and tools for AI and ML.
8. **Data Science and Analytics**: Skills focused on data manipulation, analysis, and visualization.
9. **Database Systems and Technologies**: Systems and languages for database management and storage.
10. **Quality Assurance and Testing**: Tools and methodologies for software testing and QA.
11. **Project Management Methodologies**: Practices for organizing, planning, and managing projects.
12. **UI and UX Design**: Skills for designing user interfaces and user experiences.
13. **Systems Architecture**: Skills for designing and structuring software or network systems.
14. **Soft Skills**: Personal and interpersonal skills for workplace effectiveness.

Given this technical skill: "{skill}", output a single line of comma-separated values where each value represents the skill’s relevance to a category as follows:

1: The skill is a primary skill for the category.
2: The skill is a secondary skill for the category.
0: The skill is not relevant to the category.

Categories:
Evaluate each skill across the following 20 categories in this order: "software development, web development, mobile development, cloud computing, devops, networking, cybersecurity, data science, machine learning, data engineering, system administration, quality assurance, project management, ui design, systems architecture, game development, soft skills, certifications, compliance"

ONLY OUTPUT VALUES 0-2 BASED ON RELEVANCY. Do NOT output any other number. Do NOT output code. Do NOT ask the user for any further confirmations. Do NOT give any other explanation. ONLY MATCH the given technical skill: "{skill}" to the categories given above. ONLY output the line of comma-separated relevancy values per skill.
"""

headers = [
    "skill name",
    "software development",
    "web development",
    "mobile development",
    "cloud computing",
    "devops",
    "networking",
    "cybersecurity",
    "data science",
    "machine learning",
    "data engineering",
    "system administration"
    "quality assurance",
    "project management",
    "ui design",
    "systems architecture",
    "game development",
    "soft skills",
    "certifications",
    "compliance",
]

In [145]:
prompt = ChatPromptTemplate.from_template(system_message)
model = OllamaLLM(model="llama3.2")
tagging_chain = prompt | model

In [None]:
def get_tagged_string(skill_name, num_tags=20):
    sanity_checked = False
    response = ""

    # Loop until we have a valid response of the correct length and content
    while not sanity_checked or len(response.split(",")) != num_tags:
        response = tagging_chain.invoke({"skill": skill_name})
        value = response.split(",")

        # Check if the response has the correct number of tags and only valid values
        if len(value) == num_tags and all(int(item) in [0, 1, 2] for item in value):
            sanity_checked = True
        else:
            sanity_checked = False

    print(response)
    tagged_string = response.strip()
    return tagged_string


# Function to add a skill's tags to the CSV file
def append_skill_to_csv(skill_name, csv_filename="skills_with_tags.csv"):
    # Get the model's output as a tagged string
    tagged_string = get_tagged_string(skill_name, len(headers))

    # Combine the skill name with the tagged output
    skill_row = [skill_name] + tagged_string.split(",")

    # Append to the CSV file
    with open(csv_filename, mode="a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(skill_row)


def initialize_csv(csv_filename="skills_with_tags.csv"):
    with open(csv_filename, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(headers)

In [147]:
# Example usage
initialize_csv()  # Run only once to create and initialize the file
with open('baseline_taxonomies/skills.txt', 'r') as f:
    skills = f.readlines()
    skills = [x.strip() for x in skills]

for skill in skills:
    append_skill_to_csv(skill)






























































































































































































































































































































































































































































































































































































































































































































































































































































































































In [148]:
import pandas as pd

# df = pd.read_csv('skills_with_tags.csv')
for idx, row in df.iterrows():
    # print(row['skill name'])
    for item in row[1:]:
        if item != 0 and item != 1 and item != 2:
            print(row['skill name'])
            break

business intelligence tools
cakephp
data center management
database maintenance
database management system (dbms)
mariadb
microsoft dynamics nav
microsoft power bi
nhibernate
nuget
oracle database
oracle sql developer
presto
puppet (software)
pyspark
sharepoint administration
spritekit
tableau
airtable
bigquery
clickhouse
cloud storage
cockroachdb
couch db
data center architecture
device drivers
dynamodb
elixir
erlang
ibm db2
lightning ai
matlab
microsoft sql server
neo4j
pandas
prolog
qt
qt creator
scons
tidb
unity 3d
unreal engine
xamarin
