In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../Resources/AI_Impact_On_Jobs_2030.csv')

In [3]:
# Preview the first five rows to confirm the CSV loaded as expected.
df.head(5)

Unnamed: 0,Job_Title,Average_Salary,Years_Experience,Education_Level,AI_Exposure_Index,Tech_Growth_Factor,Automation_Probability_2030,Risk_Category,Skill_1,Skill_2,Skill_3,Skill_4,Skill_5,Skill_6,Skill_7,Skill_8,Skill_9,Skill_10
0,Security Guard,45795,28,Master's,0.18,1.28,0.85,High,0.45,0.1,0.46,0.33,0.14,0.65,0.06,0.72,0.94,0.0
1,Research Scientist,133355,20,PhD,0.62,1.11,0.05,Low,0.02,0.52,0.4,0.05,0.97,0.23,0.09,0.62,0.38,0.98
2,Construction Worker,146216,2,High School,0.86,1.18,0.81,High,0.01,0.94,0.56,0.39,0.02,0.23,0.24,0.68,0.61,0.83
3,Software Engineer,136530,13,PhD,0.39,0.68,0.6,Medium,0.43,0.21,0.57,0.03,0.84,0.45,0.4,0.93,0.73,0.33
4,Financial Analyst,70397,22,High School,0.52,1.46,0.64,Medium,0.75,0.54,0.59,0.97,0.61,0.28,0.3,0.17,0.02,0.42


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

In [5]:
# Band edges in years. They are used with right=False below, so each band is
# half-open: [-1, 2), [2, 5), [5, 10), [10, 20), [20, inf).
bins = [-1, 2, 5, 10, 20, float('inf')]
labels = ['Entry/Junior', 'Early Career', 'Mid-Level', 'Senior', 'Expert/Late']


def add_experience_band(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with experience-band columns appended.

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing a numeric ``Years_Experience`` column. It is not
        modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` with two extra columns:

        * ``Experience_Band`` - categorical label taken from ``labels``.
        * ``Experience_Band_Code`` - integer category code of that label
          (0 for the first label); values that fall outside ``bins`` have no
          band and get code -1.
    """
    experience_band = pd.cut(
        X['Years_Experience'], bins=bins, labels=labels, right=False
    )
    return X.assign(
        Experience_Band=experience_band,
        Experience_Band_Code=experience_band.cat.codes,
    )

In [6]:
# Salary band edges. They are used with right=False below, so the bands are
# [-inf, 30500), [30500, 100000) and [100000, inf).
bins2 = [-float('inf'), 30500, 100000, float('inf')]
labels2 = ['Low', 'Middle', 'High']


def add_income_band(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with income-band columns appended.

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing a numeric ``Average_Salary`` column. It is not
        modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` with two extra columns:

        * ``Income_Band`` - categorical label taken from ``labels2``.
        * ``Income_Band_Code`` - integer category code of that label
          (0 = 'Low', 1 = 'Middle', 2 = 'High'); missing salaries get -1.
    """
    income_band = pd.cut(
        X['Average_Salary'], bins=bins2, labels=labels2, right=False
    )
    return X.assign(
        Income_Band=income_band,
        Income_Band_Code=income_band.cat.codes,
    )

In [7]:
# Job title -> sector lookup. The order in which sectors first appear here also
# defines the integer code assigned to each sector in add_job_sector.
job_sector_map = {
    "Security Guard":      "Security & Protective Services",
    "Research Scientist":  "Research & Development",
    "Construction Worker": "Construction",
    "Software Engineer":   "Digital & Technology",
    "Financial Analyst":   "Finance & Professional Services",
    "AI Engineer":         "Digital & Technology",
    "Mechanic":            "Transport & Automotive Services",
    "Teacher":             "Education",
    "HR Specialist":       "Business & Professional Services",
    "Customer Support":    "Customer Service",
    "UX Researcher":       "Digital & Creative Industries",
    "Lawyer":              "Legal Services",
    "Data Scientist":      "Digital & Technology",
    "Graphic Designer":    "Creative Industries",
    "Retail Worker":       "Retail & Consumer Services",
    "Doctor":              "Health & Social Care",
    "Truck Driver":        "Transport & Logistics",
    "Chef":                "Hospitality & Catering",
    "Nurse":               "Health & Social Care",
    "Marketing Manager":   "Marketing & Professional Services",
}

def add_job_sector(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with the job sector and a stable sector code.

    Sector codes are taken from the order in which sectors first appear in
    ``job_sector_map``, not from the order of rows in ``X``. The same sector
    therefore receives the same code on any frame (reordered, filtered, or a
    train/test split), which row-order-based factorizing does not guarantee.

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing a ``Job_Title`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` with two extra columns:

        * ``Job_Sector`` - sector looked up in ``job_sector_map``; NaN for
          titles missing from the map.
        * ``Job_Sector_Code`` - int64 sector code; -1 where ``Job_Sector``
          is NaN.
    """
    X = X.copy()
    X['Job_Sector'] = X['Job_Title'].map(job_sector_map)

    # dict.fromkeys de-duplicates the sectors while keeping first-seen order.
    sector_order = list(dict.fromkeys(job_sector_map.values()))
    sector_codes = pd.Categorical(X['Job_Sector'], categories=sector_order).codes

    # Categorical codes are int8; widen to the int64 that factorize produced.
    X['Job_Sector_Code'] = sector_codes.astype('int64')
    return X

In [8]:
labour_group_map = {
    # ** Public_Sector **
    "Teacher":          "Public_Sector",
    "Doctor":           "Public_Sector",
    "Nurse":            "Public_Sector",

    # ** Private_Professional **
    "Financial Analyst":"Private_Professional",
    "HR Specialist":    "Private_Professional",
    "Lawyer":           "Private_Professional",
    "Marketing Manager":"Private_Professional",

    # ** Digital_Creative **
    "Research Scientist":"Digital_Creative",
    "Software Engineer": "Digital_Creative",
    "AI Engineer":       "Digital_Creative",
    "UX Researcher":     "Digital_Creative",
    "Data Scientist":    "Digital_Creative",
    "Graphic Designer":  "Digital_Creative",

    # ** Manual_Trade_Service **
    "Security Guard":    "Manual_Trade_Service",
    "Construction Worker":"Manual_Trade_Service",
    "Mechanic":          "Manual_Trade_Service",
    "Customer Support":  "Manual_Trade_Service",
    "Retail Worker":     "Manual_Trade_Service",
    "Truck Driver":      "Manual_Trade_Service",
    "Chef":              "Manual_Trade_Service",
}

# Fixed code order for the labour groups (position == code). It reproduces the
# codes this notebook displays for the source CSV, where they came from
# first-appearance order of the rows.
LABOUR_GROUP_ORDER = [
    "Manual_Trade_Service",
    "Digital_Creative",
    "Private_Professional",
    "Public_Sector",
]

def add_labour_group(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with the labour group and a stable group code.

    Group codes come from ``LABOUR_GROUP_ORDER`` instead of the order of rows
    in ``X``, so a group keeps the same code on any frame (reordered,
    filtered, or a train/test split).

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing a ``Job_Title`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` with two extra columns:

        * ``Labour_Group`` - group looked up in ``labour_group_map``; NaN for
          titles missing from the map.
        * ``Labour_Group_Code`` - int64 group code; -1 where ``Labour_Group``
          is NaN.
    """
    X = X.copy()
    X['Labour_Group'] = X['Job_Title'].map(labour_group_map)

    # Groups added to the map but not yet listed in LABOUR_GROUP_ORDER are
    # appended in map order, so they get a real code rather than -1.
    group_order = LABOUR_GROUP_ORDER + [
        group
        for group in dict.fromkeys(labour_group_map.values())
        if group not in LABOUR_GROUP_ORDER
    ]
    group_codes = pd.Categorical(X['Labour_Group'], categories=group_order).codes

    # Categorical codes are int8; widen to the int64 that factorize produced.
    X['Labour_Group_Code'] = group_codes.astype('int64')
    return X

In [9]:
# Risk levels from lowest to highest; the position in the list is the code.
risk_order = [['Low', 'Medium', 'High']]

# Values not listed in risk_order are encoded as -1 instead of raising.
risk_encoder = OrdinalEncoder(
    categories=risk_order, handle_unknown='use_encoded_value', unknown_value=-1
)


def encode_risk(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with ``Risk_Category`` encoded as an ordinal.

    Note that the module-level ``risk_encoder`` is re-fitted on every call.

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing a ``Risk_Category`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` with a ``Risk_Category_Code`` column holding the
        encoder output (0.0 = 'Low', 1.0 = 'Medium', 2.0 = 'High').
    """
    risk_codes = risk_encoder.fit_transform(X[['Risk_Category']])
    encoded = X.copy()
    encoded['Risk_Category_Code'] = risk_codes
    return encoded

In [10]:
# Education levels from lowest to highest; the position in the list is the code.
education_order = [["High School", "Bachelor's", "Master's", "PhD"]]

# Values not listed in education_order are encoded as -1 instead of raising.
education_encoder = OrdinalEncoder(
    categories=education_order, handle_unknown='use_encoded_value', unknown_value=-1
)


def encode_education(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with ``Education_Level`` encoded as an ordinal.

    Note that the module-level ``education_encoder`` is re-fitted on every
    call.

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing an ``Education_Level`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` with an ``Education_Level_Code`` column holding the
        encoder output (0.0 = 'High School' ... 3.0 = 'PhD').
    """
    education_codes = education_encoder.fit_transform(X[['Education_Level']])
    encoded = X.copy()
    encoded['Education_Level_Code'] = education_codes
    return encoded

In [11]:
def add_job_title_code(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with an integer code for each job title.

    Codes are assigned by ``pd.factorize`` in order of first appearance in
    ``X``, so they depend on the row order of the frame that is passed in.
    Missing titles get code -1.

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing a ``Job_Title`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` with a ``Job_Title_Code`` column.
    """
    title_codes, _ = pd.factorize(X['Job_Title'])
    return X.assign(Job_Title_Code=title_codes)

In [12]:
# Feature-engineering pipeline. Every step wraps one of the column-adding
# functions defined earlier in the notebook; validate=False means the
# DataFrame is handed to each function without scikit-learn input validation.
# The steps run in the order listed.
pipe = Pipeline(steps=[
    (step_name, FunctionTransformer(step_func, validate=False))
    for step_name, step_func in (
        ('experience_binner', add_experience_band),
        ('income_binner', add_income_band),
        ('job_sector_mapper', add_job_sector),
        ('labour_group_mapper', add_labour_group),
        ('risk_to_ordinal', encode_risk),
        ('education_to_ordinal', encode_education),
        ('job_title_code_mapper', add_job_title_code),
    )
])

In [13]:
# Run every feature-engineering step on the raw frame; `df` itself is left
# untouched because each step works on a copy.
df_transformed = pipe.fit_transform(df)

In [14]:
# Preview the transformed frame, including the newly added band/code columns.
df_transformed.head()

Unnamed: 0,Job_Title,Average_Salary,Years_Experience,Education_Level,AI_Exposure_Index,Tech_Growth_Factor,Automation_Probability_2030,Risk_Category,Skill_1,Skill_2,...,Experience_Band_Code,Income_Band,Income_Band_Code,Job_Sector,Job_Sector_Code,Labour_Group,Labour_Group_Code,Risk_Category_Code,Education_Level_Code,Job_Title_Code
0,Security Guard,45795,28,Master's,0.18,1.28,0.85,High,0.45,0.1,...,4,Middle,1,Security & Protective Services,0,Manual_Trade_Service,0,2.0,2.0,0
1,Research Scientist,133355,20,PhD,0.62,1.11,0.05,Low,0.02,0.52,...,4,High,2,Research & Development,1,Digital_Creative,1,0.0,3.0,1
2,Construction Worker,146216,2,High School,0.86,1.18,0.81,High,0.01,0.94,...,1,High,2,Construction,2,Manual_Trade_Service,0,2.0,0.0,2
3,Software Engineer,136530,13,PhD,0.39,0.68,0.6,Medium,0.43,0.21,...,3,High,2,Digital & Technology,3,Digital_Creative,1,1.0,3.0,3
4,Financial Analyst,70397,22,High School,0.52,1.46,0.64,Medium,0.75,0.54,...,4,Middle,1,Finance & Professional Services,4,Private_Professional,2,1.0,0.0,4


In [15]:
# Count rows per (band, code) pair to check that each experience band maps to exactly one code.
df_transformed[['Experience_Band', 'Experience_Band_Code']].value_counts(sort=True)

Experience_Band  Experience_Band_Code
Expert/Late      4                       1032
Senior           3                        983
Mid-Level        2                        486
Early Career     1                        301
Entry/Junior     0                        198
Name: count, dtype: int64

In [16]:
# Count rows per (sector, code) pair to check that each job sector maps to exactly one code.
df_transformed[['Job_Sector','Job_Sector_Code']].value_counts(sort=True)

Job_Sector                         Job_Sector_Code
Digital & Technology               3                  482
Health & Social Care               13                 274
Digital & Creative Industries      9                  167
Education                          6                  166
Creative Industries                11                 160
Business & Professional Services   7                  157
Construction                       2                  156
Retail & Consumer Services         12                 155
Security & Protective Services     0                  154
Transport & Logistics              14                 153
Finance & Professional Services    4                  151
Research & Development             1                  149
Hospitality & Catering             15                 141
Transport & Automotive Services    5                  136
Marketing & Professional Services  16                 134
Customer Service                   8                  133
Legal Services       

In [17]:
# Count rows per (group, code) pair to check that each labour group maps to exactly one code.
df_transformed[['Labour_Group','Labour_Group_Code']].value_counts(sort=True)

Labour_Group          Labour_Group_Code
Manual_Trade_Service  0                    1028
Digital_Creative      1                     958
Private_Professional  2                     574
Public_Sector         3                     440
Name: count, dtype: int64

In [18]:
# Count rows per (risk category, code) pair to confirm the Low < Medium < High ordinal encoding.
df_transformed[['Risk_Category','Risk_Category_Code']].value_counts(sort=True)

Risk_Category  Risk_Category_Code
Medium         1.0                   1521
High           2.0                    740
Low            0.0                    739
Name: count, dtype: int64

In [19]:
# Count rows per (education level, code) pair to confirm the ordinal encoding.
df_transformed[['Education_Level','Education_Level_Code']].value_counts(sort=True)

Education_Level  Education_Level_Code
High School      0.0                     784
Bachelor's       1.0                     765
Master's         2.0                     735
PhD              3.0                     716
Name: count, dtype: int64

In [23]:
# Count rows per (job title, code) pair to check that each title maps to exactly one code.
df_transformed[['Job_Title','Job_Title_Code']].value_counts(sort=True)

Job_Title            Job_Title_Code
Software Engineer    3                 175
Data Scientist       12                167
UX Researcher        10                167
Teacher              7                 166
Graphic Designer     13                160
HR Specialist        8                 157
Construction Worker  2                 156
Retail Worker        14                155
Security Guard       0                 154
Truck Driver         16                153
Financial Analyst    4                 151
Research Scientist   1                 149
Nurse                18                142
Chef                 17                141
AI Engineer          5                 140
Mechanic             6                 136
Marketing Manager    19                134
Customer Support     9                 133
Lawyer               11                132
Doctor               15                132
Name: count, dtype: int64

In [20]:
# Inspect dtypes and non-null counts of every column after the transformation.
df_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 29 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   Job_Title                    3000 non-null   object  
 1   Average_Salary               3000 non-null   int64   
 2   Years_Experience             3000 non-null   int64   
 3   Education_Level              3000 non-null   object  
 4   AI_Exposure_Index            3000 non-null   float64 
 5   Tech_Growth_Factor           3000 non-null   float64 
 6   Automation_Probability_2030  3000 non-null   float64 
 7   Risk_Category                3000 non-null   object  
 8   Skill_1                      3000 non-null   float64 
 9   Skill_2                      3000 non-null   float64 
 10  Skill_3                      3000 non-null   float64 
 11  Skill_4                      3000 non-null   float64 
 12  Skill_5                      3000 non-null   float64 
 13  Ski

In [21]:
# (rows, columns) of the transformed frame.
df_transformed.shape

(3000, 29)

In [22]:
# Final look at the first three transformed rows.
df_transformed.head(3)

Unnamed: 0,Job_Title,Average_Salary,Years_Experience,Education_Level,AI_Exposure_Index,Tech_Growth_Factor,Automation_Probability_2030,Risk_Category,Skill_1,Skill_2,...,Experience_Band_Code,Income_Band,Income_Band_Code,Job_Sector,Job_Sector_Code,Labour_Group,Labour_Group_Code,Risk_Category_Code,Education_Level_Code,Job_Title_Code
0,Security Guard,45795,28,Master's,0.18,1.28,0.85,High,0.45,0.1,...,4,Middle,1,Security & Protective Services,0,Manual_Trade_Service,0,2.0,2.0,0
1,Research Scientist,133355,20,PhD,0.62,1.11,0.05,Low,0.02,0.52,...,4,High,2,Research & Development,1,Digital_Creative,1,0.0,3.0,1
2,Construction Worker,146216,2,High School,0.86,1.18,0.81,High,0.01,0.94,...,1,High,2,Construction,2,Manual_Trade_Service,0,2.0,0.0,2
