In [54]:
# Import Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Loading Data
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [55]:
df[pd.notna(df['salary_year_avg'])]['salary_year_avg']

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [56]:
df_salary = df[pd.notna(df['salary_year_avg'])].copy()

def projected_salary(salary):
    return salary * 1.03

df_salary['salary_year_inflated']= df_salary['salary_year_avg'].apply(projected_salary)

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [57]:
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [58]:
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'] * 1.03

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [59]:
import ast

In [60]:
def clean_list(skill_list):
        if pd.notna(skill_list):
         return ast.literal_eval(skill_list)

df['job_skills'] = df['job_skills'].apply(clean_list)

In [62]:
type(df['job_skills'][0])

NoneType

In [67]:
import ast

def parse_job_skills(x):
    # If already list → return
    if isinstance(x, list):
        return x
    
    # If NaN/None → return empty list
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return []
    
    # If string → try to parse
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except:
            return []
    
    # Fallback
    return []

df['job_skills'] = df['job_skills'].apply(parse_job_skills)

In [68]:
df['job_skills'].head(10).tolist()

[[],
 ['r', 'python', 'sql', 'nosql', 'power bi', 'tableau'],
 ['python',
  'sql',
  'c#',
  'azure',
  'airflow',
  'dax',
  'docker',
  'kubernetes',
  'jenkins'],
 ['python', 'c++', 'java', 'matlab', 'aws', 'tensorflow', 'keras', 'pytorch'],
 ['bash',
  'python',
  'oracle',
  'aws',
  'ansible',
  'puppet',
  'jenkins',
  'gitlab',
  'git'],
 ['python', 'sql', 'gcp'],
 ['sql', 'python', 'java', 'sql server', 'gcp', 'bigquery', 'hadoop'],
 ['sql',
  'nosql',
  'gcp',
  'azure',
  'aws',
  'bigquery',
  'databricks',
  'redshift',
  'airflow',
  'kafka',
  'spark'],
 ['excel', 'powerpoint', 'power bi'],
 ['sql',
  'python',
  'r',
  'mongodb',
  'mongodb',
  'sql server',
  'azure',
  'pandas',
  'spark',
  'windows',
  'excel']]

In [69]:
df['job_skills'] = (
    df['job_skills']
    .apply(lambda lst: [s.lower().strip() for s in lst] if isinstance(lst, list) else [])
)

In [70]:
df['job_skills'].head()

0                                                   []
1           [r, python, sql, nosql, power bi, tableau]
2    [python, sql, c#, azure, airflow, dax, docker,...
3    [python, c++, java, matlab, aws, tensorflow, k...
4    [bash, python, oracle, aws, ansible, puppet, j...
Name: job_skills, dtype: object

In [72]:
df_salary['salary_year_avg'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,116168.5500,112785.00
77,148526.0000,144200.00
92,127308.0000,123600.00
100,242120.7198,235068.66
109,94420.1000,91670.00
...,...,...
785624,147694.2544,143392.48
785641,159135.0000,154500.00
785648,235387.1875,228531.25
785682,167091.7500,162225.00


In [75]:
def projected_salary(row):
    if "Senior" in row['job_title_short']:
        return 1.05 * row['salary_year_avg']
    else:
        return 1.03 * row['salary_year_avg']

df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis=1)

df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,116168.5500,119653.606500
77,Data Engineer,148526.0000,152981.780000
92,Data Engineer,127308.0000,131127.240000
100,Data Scientist,242120.7198,249384.341394
109,Data Analyst,94420.1000,97252.703000
...,...,...,...
785624,Data Engineer,147694.2544,152125.082032
785641,Data Engineer,159135.0000,163909.050000
785648,Data Scientist,235387.1875,242448.803125
785682,Data Scientist,167091.7500,172104.502500
