In [48]:
import category_encoders as ce
import itertools
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import shap
import sqlite3
import sys
import warnings
try:
    import fireducks.pandas as pd
except ImportError:
    import pandas as pd
    pd.set_option("mode.copy_on_write", True)
from category_encoders import TargetEncoder as CE_TargetEncoder
from matplotlib import rcParams
from sklearn.feature_extraction.text import TfidfVectorizer as TfidfVectoriser

In [2]:
warnings.filterwarnings("ignore")

In [3]:
# Monkey patching NumPy for compatibility with version >= 1.24
np.float = np.float64
np.int = np.int_
np.object = np.object_
np.bool = np.bool_

In [4]:
# Define constants
GOLDEN_RATIO = 1.618033989
FIG_WIDTH = 12
FIG_HEIGHT = FIG_WIDTH / GOLDEN_RATIO
FIG_SIZE = (FIG_WIDTH, FIG_HEIGHT)
FIG_DPI = 72
RANDOM_SAMPLE_SIZE = 13
RANDOM_SEED = 42
ALPHA_VALUE = 0.05

In [5]:
# Plotting parameters
rcParams["figure.figsize"] = FIG_SIZE
rcParams["figure.dpi"] = FIG_DPI
rcParams["savefig.format"] = "svg"

In [6]:
df_compact = pd.read_parquet("~/zzz_personal/.assets/data/000_common_dataset/datanerd-jobs-full-dataset-compact-in-parquet.parquet", engine="fastparquet")

In [7]:
df_compact

Unnamed: 0,job_id,job_title,company_name,job_location,requested_skills,field,job_via,job_schedule,remote_job,need_degree,has_insurance,country,yearly_salary
0,0,data analyst,cryptology,anywhere,"excel, tableau, power bi",analyst_tools,via linkedin,full-time,True,False,False,serbia,
1,0,data analyst,cryptology,anywhere,"sql, python",programming,via linkedin,full-time,True,False,False,serbia,
2,1,data analyst,point32health,"west bridgewater, ma","excel, sas",analyst_tools,via adzuna,full-time,False,False,True,united states,
3,1,data analyst,point32health,"west bridgewater, ma","sas, sql",programming,via adzuna,full-time,False,False,True,united states,
4,2,data analyst,apex systems,"naperville, il","power bi, ssis",analyst_tools,via linkedin,full-time,False,False,False,united states,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1910586,1826669,data scientist,kth royal institute of technology,"stockholm, sweden",none,none,via trabajo.org,full-time,False,False,False,sweden,
1910587,1826673,data scientist,general dynamics,"bossier city, la","power bi, looker, tableau",analyst_tools,via bebee,full-time,False,False,True,united states,
1910588,1826673,data scientist,general dynamics,"bossier city, la","bigquery, aws",cloud,via bebee,full-time,False,False,True,united states,
1910589,1826673,data scientist,general dynamics,"bossier city, la","c#, java, javascript, python, r, sql",programming,via bebee,full-time,False,False,True,united states,


In [8]:
df_compact.shape

(1910591, 13)

In [9]:
df_compact = df_compact.dropna()

In [10]:
df_compact.shape

(82786, 13)

In [11]:
cat_columns = df_compact.select_dtypes(include=["category", "object", "bool"]).columns.tolist()
num_columns = df_compact.select_dtypes(include=["float32", "float64", "int32", "float64", "int64"]).columns.tolist()

In [12]:
print(cat_columns)
print(num_columns)

['job_title', 'company_name', 'job_location', 'requested_skills', 'field', 'job_via', 'job_schedule', 'remote_job', 'need_degree', 'has_insurance', 'country']
['job_id', 'yearly_salary']


In [13]:
df_tfidf = df_compact.copy()
df_ce = df_compact.copy()

In [14]:
tfidf_vectoriser = TfidfVectoriser(stop_words="english", ngram_range=(1, 3))

In [15]:
df_tfidf

Unnamed: 0,job_id,job_title,company_name,job_location,requested_skills,field,job_via,job_schedule,remote_job,need_degree,has_insurance,country,yearly_salary
170,67,data scientist,upwork,anywhere,"tensorflow, pytorch, scikit-learn",libraries,via upwork,contractor and temp work,True,False,False,united states,176800.0
171,67,data scientist,upwork,anywhere,"python, go, r",programming,via upwork,contractor and temp work,True,False,False,united states,176800.0
172,67,data scientist,upwork,anywhere,zoom,sync,via upwork,contractor and temp work,True,False,False,united states,176800.0
195,76,data engineer,interactive resources - ir,"denver, co","databricks, aws",cloud,via linkedin,contractor,False,False,False,united states,145600.0
196,76,data engineer,interactive resources - ir,"denver, co",pyspark,libraries,via linkedin,contractor,False,False,False,united states,145600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1910393,1826345,data scientist,sequoia research corporation,"torrance, ca",linux,os,via indeed,full-time,False,False,True,united states,114500.0
1910394,1826345,data scientist,sequoia research corporation,"torrance, ca","c, fortran, matlab, python",programming,via indeed,full-time,False,False,True,united states,114500.0
1910462,1826455,machine learning engineer,amazon.com,"santa clara, ca",aws,cloud,via ai-jobs.net,full-time,False,False,True,united states,179000.0
1910463,1826455,machine learning engineer,amazon.com,"santa clara, ca",flow,other,via ai-jobs.net,full-time,False,False,True,united states,179000.0


In [16]:
df_tfidf[cat_columns] = df_tfidf[cat_columns].astype(str)

for item in cat_columns:
    transformed_item = tfidf_vectoriser.fit_transform(df_tfidf[item]).toarray()
    df_tfidf[item + "_tfidf"] = list(transformed_item)

In [17]:
df_tfidf

Unnamed: 0,job_id,job_title,company_name,job_location,requested_skills,field,job_via,job_schedule,remote_job,need_degree,...,company_name_tfidf,job_location_tfidf,requested_skills_tfidf,field_tfidf,job_via_tfidf,job_schedule_tfidf,remote_job_tfidf,need_degree_tfidf,has_insurance_tfidf,country_tfidf
170,67,data scientist,upwork,anywhere,"tensorflow, pytorch, scikit-learn",libraries,via upwork,contractor and temp work,True,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.24897259916771652, 0.0, 0.4470672326616327,...","[0.0, 1.0]","[1.0, 0.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
171,67,data scientist,upwork,anywhere,"python, go, r",programming,via upwork,contractor and temp work,True,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.24897259916771652, 0.0, 0.4470672326616327,...","[0.0, 1.0]","[1.0, 0.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
172,67,data scientist,upwork,anywhere,zoom,sync,via upwork,contractor and temp work,True,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.24897259916771652, 0.0, 0.4470672326616327,...","[0.0, 1.0]","[1.0, 0.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
195,76,data engineer,interactive resources - ir,"denver, co","databricks, aws",cloud,via linkedin,contractor,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
196,76,data engineer,interactive resources - ir,"denver, co",pyspark,libraries,via linkedin,contractor,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1910393,1826345,data scientist,sequoia research corporation,"torrance, ca",linux,os,via indeed,full-time,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1910394,1826345,data scientist,sequoia research corporation,"torrance, ca","c, fortran, matlab, python",programming,via indeed,full-time,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1910462,1826455,machine learning engineer,amazon.com,"santa clara, ca",aws,cloud,via ai-jobs.net,full-time,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1910463,1826455,machine learning engineer,amazon.com,"santa clara, ca",flow,other,via ai-jobs.net,full-time,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [18]:
df_tfidf.sample(RANDOM_SAMPLE_SIZE)

Unnamed: 0,job_id,job_title,company_name,job_location,requested_skills,field,job_via,job_schedule,remote_job,need_degree,...,company_name_tfidf,job_location_tfidf,requested_skills_tfidf,field_tfidf,job_via_tfidf,job_schedule_tfidf,remote_job_tfidf,need_degree_tfidf,has_insurance_tfidf,country_tfidf
1178359,872934,data engineer,capco,"kuala lumpur, federal territory of kuala lumpu...","power bi, tableau",analyst_tools,via ai-jobs.net,full-time,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1254317,955852,data scientist,intelletec,anywhere,python,programming,via linkedin,full-time,True,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0]","[1.0, 0.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1586953,1360578,senior data analyst,helen of troy,"arlington, tx",sql,programming,via ladders,full-time,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
210811,99497,data engineer,fidelity investments,"durham, nc","power bi, tableau",analyst_tools,via ladders,full-time,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
481959,264650,data analyst,clickjobs.io,"manchester, uk",bigquery,cloud,via linkedin,full-time,False,True,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[0.0, 1.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
401048,211328,data engineer,sigma software,"kyiv, ukraine","python, sql",programming,via ai-jobs.net,full-time,False,True,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[0.0, 1.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
397208,208977,data engineer,cvs health,"providence, ri",unix,os,via linkedin,full-time,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
267622,130703,data engineer,cvs health,"hartford, ct","python, sql",programming,via linkedin,full-time,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
786426,496900,senior data scientist,hhs careers,"bridgeton, mo","r, sas",programming,via professional diversity network,full-time and part-time,False,False,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1291658,998086,data analyst,poshmark,"chennai, tamil nadu, india",express,webframeworks,via ai-jobs.net,full-time,False,True,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0]","[0.0, 1.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [19]:
df_tfidf["company_name_tfidf"]

170        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
171        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
172        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
195        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
196        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                                 ...                        
1910393    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1910394    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1910462    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1910463    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1910464    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: company_name_tfidf, Length: 82786, dtype: object

In [20]:
df_tfidf.columns

Index(['job_id', 'job_title', 'company_name', 'job_location',
       'requested_skills', 'field', 'job_via', 'job_schedule', 'remote_job',
       'need_degree', 'has_insurance', 'country', 'yearly_salary',
       'job_title_tfidf', 'company_name_tfidf', 'job_location_tfidf',
       'requested_skills_tfidf', 'field_tfidf', 'job_via_tfidf',
       'job_schedule_tfidf', 'remote_job_tfidf', 'need_degree_tfidf',
       'has_insurance_tfidf', 'country_tfidf'],
      dtype='object')

In [21]:
# Reorder columns side by side with tdidf columns
tfidf_cols = [item + "_tfidf" for item in cat_columns]
other_cols = [item for item in df_tfidf.columns if item not in tfidf_cols]


In [22]:
tfidf_cols

['job_title_tfidf',
 'company_name_tfidf',
 'job_location_tfidf',
 'requested_skills_tfidf',
 'field_tfidf',
 'job_via_tfidf',
 'job_schedule_tfidf',
 'remote_job_tfidf',
 'need_degree_tfidf',
 'has_insurance_tfidf',
 'country_tfidf']

In [23]:
other_cols.remove("job_id")
other_cols.remove("yearly_salary")

In [24]:
other_cols

['job_title',
 'company_name',
 'job_location',
 'requested_skills',
 'field',
 'job_via',
 'job_schedule',
 'remote_job',
 'need_degree',
 'has_insurance',
 'country']

In [25]:
# Adapted from: How do I merge two lists into a single list? https://stackoverflow.com/a/3472069
reordered_cols = [j for i in zip(other_cols, tfidf_cols) for j in i]

In [26]:
reordered_cols

['job_title',
 'job_title_tfidf',
 'company_name',
 'company_name_tfidf',
 'job_location',
 'job_location_tfidf',
 'requested_skills',
 'requested_skills_tfidf',
 'field',
 'field_tfidf',
 'job_via',
 'job_via_tfidf',
 'job_schedule',
 'job_schedule_tfidf',
 'remote_job',
 'remote_job_tfidf',
 'need_degree',
 'need_degree_tfidf',
 'has_insurance',
 'has_insurance_tfidf',
 'country',
 'country_tfidf']

In [27]:
reordered_cols = ["job_id"] + reordered_cols

In [28]:
reordered_cols = reordered_cols + ["yearly_salary"]

In [29]:
reordered_cols

['job_id',
 'job_title',
 'job_title_tfidf',
 'company_name',
 'company_name_tfidf',
 'job_location',
 'job_location_tfidf',
 'requested_skills',
 'requested_skills_tfidf',
 'field',
 'field_tfidf',
 'job_via',
 'job_via_tfidf',
 'job_schedule',
 'job_schedule_tfidf',
 'remote_job',
 'remote_job_tfidf',
 'need_degree',
 'need_degree_tfidf',
 'has_insurance',
 'has_insurance_tfidf',
 'country',
 'country_tfidf',
 'yearly_salary']

In [30]:
df_tfidf = df_tfidf[reordered_cols]

In [31]:
df_tfidf

Unnamed: 0,job_id,job_title,job_title_tfidf,company_name,company_name_tfidf,job_location,job_location_tfidf,requested_skills,requested_skills_tfidf,field,...,job_schedule_tfidf,remote_job,remote_job_tfidf,need_degree,need_degree_tfidf,has_insurance,has_insurance_tfidf,country,country_tfidf,yearly_salary
170,67,data scientist,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.32434641924452245,...",upwork,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",anywhere,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","tensorflow, pytorch, scikit-learn","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",libraries,...,"[0.24897259916771652, 0.0, 0.4470672326616327,...",True,"[0.0, 1.0]",False,"[1.0, 0.0]",False,"[1.0, 0.0]",united states,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",176800.0
171,67,data scientist,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.32434641924452245,...",upwork,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",anywhere,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","python, go, r","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",programming,...,"[0.24897259916771652, 0.0, 0.4470672326616327,...",True,"[0.0, 1.0]",False,"[1.0, 0.0]",False,"[1.0, 0.0]",united states,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",176800.0
172,67,data scientist,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.32434641924452245,...",upwork,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",anywhere,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",zoom,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",sync,...,"[0.24897259916771652, 0.0, 0.4470672326616327,...",True,"[0.0, 1.0]",False,"[1.0, 0.0]",False,"[1.0, 0.0]",united states,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",176800.0
195,76,data engineer,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.35812684896046876,...",interactive resources - ir,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","denver, co","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","databricks, aws","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",cloud,...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[1.0, 0.0]",False,"[1.0, 0.0]",False,"[1.0, 0.0]",united states,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",145600.0
196,76,data engineer,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.35812684896046876,...",interactive resources - ir,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","denver, co","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",pyspark,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",libraries,...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[1.0, 0.0]",False,"[1.0, 0.0]",False,"[1.0, 0.0]",united states,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",145600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1910393,1826345,data scientist,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.32434641924452245,...",sequoia research corporation,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","torrance, ca","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",linux,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",os,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[1.0, 0.0]",False,"[1.0, 0.0]",True,"[0.0, 1.0]",united states,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",114500.0
1910394,1826345,data scientist,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.32434641924452245,...",sequoia research corporation,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","torrance, ca","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","c, fortran, matlab, python","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",programming,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[1.0, 0.0]",False,"[1.0, 0.0]",True,"[0.0, 1.0]",united states,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",114500.0
1910462,1826455,machine learning engineer,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",amazon.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","santa clara, ca","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",aws,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",cloud,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[1.0, 0.0]",False,"[1.0, 0.0]",True,"[0.0, 1.0]",united states,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",179000.0
1910463,1826455,machine learning engineer,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",amazon.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","santa clara, ca","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",flow,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",other,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[1.0, 0.0]",False,"[1.0, 0.0]",True,"[0.0, 1.0]",united states,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",179000.0


In [49]:
combined_list = list(zip(other_cols, tfidf_cols))

In [None]:
# From Python - Alternate List elements https://www.geeksforgeeks.org/python/python-alternate-list-elements/
result_list = list(itertools.chain(*combined_list))

In [51]:
print(result_list)

['job_title', 'job_title_tfidf', 'company_name', 'company_name_tfidf', 'job_location', 'job_location_tfidf', 'requested_skills', 'requested_skills_tfidf', 'field', 'field_tfidf', 'job_via', 'job_via_tfidf', 'job_schedule', 'job_schedule_tfidf', 'remote_job', 'remote_job_tfidf', 'need_degree', 'need_degree_tfidf', 'has_insurance', 'has_insurance_tfidf', 'country', 'country_tfidf']


In [32]:
target_encoder = CE_TargetEncoder()

In [33]:
for item in cat_columns:
    df_ce[item + "_cete"] = target_encoder.fit_transform(df_ce[item], df_ce["yearly_salary"])



In [34]:
cete_cols = [item + "_cete" for item in cat_columns]

In [35]:
reordered_cols02 = [j for i in zip(other_cols, cete_cols) for j in i]

In [36]:
reordered_cols02 = ["job_id"] + reordered_cols02

In [37]:
reordered_cols02 = reordered_cols02 + ["yearly_salary"]

In [38]:
reordered_cols02

['job_id',
 'job_title',
 'job_title_cete',
 'company_name',
 'company_name_cete',
 'job_location',
 'job_location_cete',
 'requested_skills',
 'requested_skills_cete',
 'field',
 'field_cete',
 'job_via',
 'job_via_cete',
 'job_schedule',
 'job_schedule_cete',
 'remote_job',
 'remote_job_cete',
 'need_degree',
 'need_degree_cete',
 'has_insurance',
 'has_insurance_cete',
 'country',
 'country_cete',
 'yearly_salary']

In [39]:
df_ce = df_ce[reordered_cols02]

In [None]:
df_ce.sample(RANDOM_SAMPLE_SIZE)

In [None]:
df_tfidf.to_excel("~/zzz_personal/.assets/data/000_common_dataset/datanerd-jobs-ohne-missing-tfidf-encoded.ods", engine="odf", index=False)

In [46]:
df_ce.to_parquet("~/zzz_personal/.assets/data/000_common_dataset/datanerd-jobs-ohne-missing-ce-target-encoded.parquet", engine="fastparquet", index=False)

In [47]:
df_ce.to_parquet("~/zzz_personal/.assets/data/000_common_dataset/datanerd-jobs-ohne-missing-ce-target-encoded.parquet.brotli", engine="fastparquet", index=False, compression="brotli")