In [1]:
# usual imports
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# trying something new: ordinal logistic regression (mord), possibly for greater accuracy(?)
from mord import LogisticAT

# warnings
# NOTE(review): with no category/module arguments this installs a global
# "ignore" filter for every Warning subclass, so deprecation and solver
# convergence warnings are hidden as well — consider narrowing the filter
# once the noisy source is identified.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-encoded dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv(
    filepath_or_buffer='../Resources/AI_Impact_On_Jobs_2030_ml_ready.csv',
)

In [3]:
# Keep only the numeric-dtype columns of the loaded frame, then display
# the resulting sub-frame as the cell output.
numerical_cols = df.select_dtypes(include="number")
numerical_cols

Unnamed: 0,Average_Salary,Years_Experience,AI_Exposure_Index,Tech_Growth_Factor,Automation_Probability_2030,Skill_1,Skill_2,Skill_3,Skill_4,Skill_5,...,Skill_8,Skill_9,Skill_10,Income_Band_Code,Experience_Band_Code,Job_Sector_Code,Labour_Group_Code,Risk_Category_Code,Job_Title_Code,Education_Level_Code
0,45795,28,0.18,1.28,0.85,0.45,0.10,0.46,0.33,0.14,...,0.72,0.94,0.00,1,4,0,0,2,0,2
1,133355,20,0.62,1.11,0.05,0.02,0.52,0.40,0.05,0.97,...,0.62,0.38,0.98,2,4,1,1,0,1,3
2,146216,2,0.86,1.18,0.81,0.01,0.94,0.56,0.39,0.02,...,0.68,0.61,0.83,2,1,2,0,2,2,0
3,136530,13,0.39,0.68,0.60,0.43,0.21,0.57,0.03,0.84,...,0.93,0.73,0.33,2,3,3,1,1,3,3
4,70397,22,0.52,1.46,0.64,0.75,0.54,0.59,0.97,0.61,...,0.17,0.02,0.42,1,4,4,2,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,111319,6,0.24,1.18,0.20,0.73,0.37,0.99,0.07,0.08,...,0.33,0.76,0.45,2,2,13,3,0,15,1
2996,44363,29,0.65,0.74,0.35,0.23,0.48,0.05,0.88,0.56,...,0.80,0.61,0.20,1,4,9,1,1,10,3
2997,61325,23,0.64,0.94,0.39,0.28,0.62,0.73,0.21,0.96,...,0.29,0.48,0.57,1,4,3,1,1,12,2
2998,110296,7,0.95,1.23,0.46,0.21,0.18,0.14,0.22,0.55,...,0.55,0.34,0.70,2,2,11,1,1,13,3


In [4]:
# Show the column labels of the numeric sub-frame (`numerical_cols`).
numerical_cols.columns

Index(['Average_Salary', 'Years_Experience', 'AI_Exposure_Index',
       'Tech_Growth_Factor', 'Automation_Probability_2030', 'Skill_1',
       'Skill_2', 'Skill_3', 'Skill_4', 'Skill_5', 'Skill_6', 'Skill_7',
       'Skill_8', 'Skill_9', 'Skill_10', 'Income_Band_Code',
       'Experience_Band_Code', 'Job_Sector_Code', 'Labour_Group_Code',
       'Risk_Category_Code', 'Job_Title_Code', 'Education_Level_Code'],
      dtype='object')

In [6]:
# Materialise the numeric column labels as a plain Python list so they
# can be reused to index `df` in later cells.
num_cols = list(numerical_cols.columns)
num_cols

['Average_Salary',
 'Years_Experience',
 'AI_Exposure_Index',
 'Tech_Growth_Factor',
 'Automation_Probability_2030',
 'Skill_1',
 'Skill_2',
 'Skill_3',
 'Skill_4',
 'Skill_5',
 'Skill_6',
 'Skill_7',
 'Skill_8',
 'Skill_9',
 'Skill_10',
 'Income_Band_Code',
 'Experience_Band_Code',
 'Job_Sector_Code',
 'Labour_Group_Code',
 'Risk_Category_Code',
 'Job_Title_Code',
 'Education_Level_Code']

In [8]:
# Pairwise correlation matrix over the numeric columns only
# (DataFrame.corr with its default settings).
correlation_num = df.loc[:, num_cols].corr()
correlation_num

Unnamed: 0,Average_Salary,Years_Experience,AI_Exposure_Index,Tech_Growth_Factor,Automation_Probability_2030,Skill_1,Skill_2,Skill_3,Skill_4,Skill_5,...,Skill_8,Skill_9,Skill_10,Income_Band_Code,Experience_Band_Code,Job_Sector_Code,Labour_Group_Code,Risk_Category_Code,Job_Title_Code,Education_Level_Code
Average_Salary,1.0,0.017389,-0.020196,-0.002214,-0.013367,-0.016888,-0.021382,0.019271,-0.024862,-0.010809,...,-0.005487,-0.00712,0.041655,0.856137,0.013763,-0.005124,0.010852,-0.018352,-0.001627,0.014944
Years_Experience,0.017389,1.0,0.034599,-0.023371,-0.017676,0.011118,0.009353,-0.016043,-0.025986,-0.001974,...,0.03074,-0.028956,0.011798,0.010033,0.929644,-0.000877,-0.004061,-0.00911,-0.010619,0.039526
AI_Exposure_Index,-0.020196,0.034599,1.0,0.016485,0.01432,-0.005675,0.018888,-0.011234,0.022386,-0.010878,...,-0.007744,0.003171,0.004656,-0.004479,0.030403,0.034845,-0.002447,0.017068,0.038892,0.01526
Tech_Growth_Factor,-0.002214,-0.023371,0.016485,1.0,0.025538,0.001253,0.004276,0.007915,-0.013198,-0.017557,...,-0.011194,0.017333,-0.033288,-0.003536,-0.021744,0.046149,-0.012402,0.02267,0.032509,-0.011959
Automation_Probability_2030,-0.013367,-0.017676,0.01432,0.025538,1.0,0.030947,0.024247,-0.020198,0.005963,-0.026349,...,-0.016452,0.001269,0.002665,0.009266,-0.025274,0.008532,-0.686357,0.924843,-0.046918,-0.01407
Skill_1,-0.016888,0.011118,-0.005675,0.001253,0.030947,1.0,-0.015902,0.038025,0.00286,-0.007998,...,-0.027314,-0.005368,-0.014475,-0.024661,0.010517,-0.022948,-0.014042,0.031474,-0.029351,0.013055
Skill_2,-0.021382,0.009353,0.018888,0.004276,0.024247,-0.015902,1.0,-0.038053,0.006365,-0.023542,...,0.00848,0.027924,-0.018727,-0.008108,0.008983,-0.014058,-0.007194,0.018234,-0.013872,-0.013727
Skill_3,0.019271,-0.016043,-0.011234,0.007915,-0.020198,0.038025,-0.038053,1.0,0.006107,-0.026576,...,0.007096,-0.020222,0.004567,0.014646,-0.012488,-0.011298,-0.005387,-0.016169,-0.008172,-0.00058
Skill_4,-0.024862,-0.025986,0.022386,-0.013198,0.005963,0.00286,0.006365,0.006107,1.0,-0.005616,...,-0.011918,-0.036873,0.013784,-0.029203,-0.019132,-0.013014,-0.016686,-0.00644,-0.005375,0.02192
Skill_5,-0.010809,-0.001974,-0.010878,-0.017557,-0.026349,-0.007998,-0.023542,-0.026576,-0.005616,1.0,...,0.006939,0.044185,-0.026335,0.006692,0.010055,-0.002984,-0.004075,-0.019554,0.005296,0.027368


In [9]:
# Rank every numeric feature by its correlation with the target column.
#
# The target's own entry is removed by label instead of slicing off the first
# sorted element: a positional `[1:]` only works while the self-correlation
# (1.0) happens to sort first, and would silently discard a real feature if
# another column tied at 1.0 or if the target's self-correlation were NaN
# (NaN sorts last).
#
# NOTE(review): Risk_Category_Code (~0.92) and Labour_Group_Code (~-0.69)
# correlate far more strongly with the target than anything else — they look
# like they may be derived from it; confirm before using them as features.
target_col = 'Automation_Probability_2030'
corr_values = (
    correlation_num[target_col]
    .drop(labels=target_col)
    .sort_values(ascending=False)
)
corr_values

Risk_Category_Code      0.924843
Skill_1                 0.030947
Tech_Growth_Factor      0.025538
Skill_2                 0.024247
AI_Exposure_Index       0.014320
Skill_6                 0.011701
Income_Band_Code        0.009266
Job_Sector_Code         0.008532
Skill_4                 0.005963
Skill_10                0.002665
Skill_9                 0.001269
Skill_7                -0.010226
Average_Salary         -0.013367
Education_Level_Code   -0.014070
Skill_8                -0.016452
Years_Experience       -0.017676
Skill_3                -0.020198
Experience_Band_Code   -0.025274
Skill_5                -0.026349
Job_Title_Code         -0.046918
Labour_Group_Code      -0.686357
Name: Automation_Probability_2030, dtype: float64