## Cardinality

Now I want to reduce the cardinality of the `job_category` column by grouping its values into a smaller set of broader job fields.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the merged, cleaned jobs dataset and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index written out by an earlier to_csv call — confirm whether it should be
# read with index_col=0 or dropped.
merged_jobs_csv = '../data/cleaned/jobs_in_data_merged.csv'
df = pd.read_csv(merged_jobs_csv)
df.head()

Unnamed: 0,work_year,job_title,job_category,employee_residence,experience_level,employment_type,work_setting,company_location,company_size,salary_in_euros,cost_of_living,purchasing_power
0,2023,Data DevOps Engineer,Data Engineering,Germany,Mid-level,Full-time,Hybrid,Germany,L,87411,127.47,685.74
1,2023,Data Architect,Data Architecture and Modeling,United States,Senior,Full-time,In-person,United States,M,171120,143.34,1193.8
2,2023,Data Architect,Data Architecture and Modeling,United States,Senior,Full-time,In-person,United States,M,75256,143.34,525.02
3,2023,Data Scientist,Data Science and Research,United States,Senior,Full-time,In-person,United States,M,195040,143.34,1360.68
4,2023,Data Scientist,Data Science and Research,United States,Senior,Full-time,In-person,United States,M,85836,143.34,598.83


In [3]:
# List every distinct job title, in order of first appearance.
job_titles = df['job_title']
job_titles.unique()

array(['Data DevOps Engineer', 'Data Architect', 'Data Scientist',
       'Machine Learning Researcher', 'Data Engineer',
       'Machine Learning Engineer', 'Data Analyst', 'Analytics Engineer',
       'Applied Scientist', 'BI Developer',
       'Business Intelligence Engineer', 'Research Scientist',
       'Research Analyst', 'Research Engineer', 'Data Science Engineer',
       'Data Quality Analyst', 'Data Product Manager',
       'Machine Learning Scientist', 'AI Engineer', 'MLOps Engineer',
       'Deep Learning Engineer', 'Data Modeler', 'Data Product Owner',
       'Data Science Consultant', 'Business Intelligence Analyst',
       'AI Developer', 'Data Manager', 'ML Engineer',
       'Data Science Director', 'Head of Data', 'BI Analyst',
       'Data Management Analyst', 'Machine Learning Modeler',
       'Data Specialist', 'BI Data Analyst', 'Data Integration Engineer',
       'Business Intelligence Manager', 'Data Integration Specialist',
       'Data Science Practitioner', 'B

In [4]:
# Row count per job_category, most frequent first.
job_category_counts = df['job_category'].value_counts()
job_category_counts

job_category
Data Science and Research         1651
Data Engineering                  1157
Machine Learning and AI            914
Data Analysis                      808
Leadership and Management          351
BI and Visualization               188
Data Architecture and Modeling     162
Data Management and Strategy        49
Data Quality and Operations         45
Cloud and Database                   5
Name: count, dtype: int64

In [5]:
# Collapse the ten job_category values into four broader job fields.
jobs_dict = {
    'Data Science and Research': 'Data Science',
    'Data Engineering': 'Data Engineering',
    'Machine Learning and AI': 'Data Science',
    'Data Analysis': 'Data Analysis',
    'Leadership and Management': 'Leadership and Management',
    'BI and Visualization': 'Data Analysis',
    'Data Architecture and Modeling': 'Data Engineering',
    'Data Management and Strategy': 'Data Engineering',
    'Data Quality and Operations': 'Data Engineering',
    'Cloud and Database': 'Data Engineering'
}

# Series.map yields NaN for any value that is not a key of the dict, and
# value_counts() drops NaN by default, so an unmapped category would vanish
# silently. Check up front that every category present in the data is mapped.
unmapped_categories = set(df['job_category'].dropna().unique()) - set(jobs_dict)
assert not unmapped_categories, (
    f'job_category values missing from jobs_dict: {sorted(unmapped_categories)}'
)

df['job_field'] = df['job_category'].map(jobs_dict)

In [6]:
# Row count per job_field after the category -> field mapping.
job_field_counts = df['job_field'].value_counts()
job_field_counts

job_field
Data Science                 2565
Data Engineering             1418
Data Analysis                 996
Leadership and Management     351
Name: count, dtype: int64

In [7]:
# Keywords used to infer a job field from a job title.
# Field order matters: assign_job_field returns the first field that has a
# matching keyword, so earlier entries take precedence.
jobs_dict_2 = dict([
    (
        "Data Science",
        ['Science', 'Scientist', 'AI', 'ML', 'Machine Learning', 'Deep Learning', 'Model'],
    ),
    (
        'Data Engineering',
        ['Engineer', 'Engineering', 'Architect', 'DB', 'Database', 'Azure', 'ETL'],
    ),
    (
        'Data Analysis',
        ['Analyst', 'Analysis', 'Analytic', 'Business', 'BI'],
    ),
])

In [8]:
def assign_job_field(job_title, keyword_map=None):
    """Return the job field whose keywords appear in ``job_title``.

    Fields are tried in the insertion order of the mapping; the first field
    with at least one keyword occurring as a case-sensitive substring of the
    title is returned.

    Parameters
    ----------
    job_title : str
        Title to classify. Non-string values (e.g. NaN for a missing title)
        are classified as 'Other' instead of raising a TypeError.
    keyword_map : dict[str, list[str]], optional
        Mapping of field name -> keywords. Defaults to the notebook-level
        ``jobs_dict_2``.

    Returns
    -------
    str
        The first matching field name, or 'Other' when no keyword matches.
    """
    if keyword_map is None:
        keyword_map = jobs_dict_2
    # `keyword in job_title` raises TypeError for floats/None, so guard first.
    if not isinstance(job_title, str):
        return 'Other'
    for field, keywords in keyword_map.items():
        if any(keyword in job_title for keyword in keywords):
            return field
    return 'Other'

In [9]:
# Rows in the 'Leadership and Management' category get their field inferred
# from the job title; every other row keeps the field assigned from its category.
# A boolean mask with .loc touches only the leadership rows instead of calling
# a Python lambda on every row of the frame.
is_leadership = df['job_category'] == 'Leadership and Management'
df.loc[is_leadership, 'job_field'] = df.loc[is_leadership, 'job_title'].map(assign_job_field)

In [10]:
# Row count per job_field after leadership titles have been re-labelled.
job_field_counts_final = df['job_field'].value_counts()
job_field_counts_final

job_field
Data Science        2565
Data Engineering    1631
Data Analysis       1029
Other                105
Name: count, dtype: int64

In [13]:
# Persist the frame, including the new job_field column, without writing the index.
cardinality_csv = '../data/cleaned/jobs_in_data_cardinality.csv'
df.to_csv(cardinality_csv, index=False)