## imports

In [2]:
import pandas as pd
import numpy as np
from dateutil import parser
import os 

## load data

In [10]:
# Read the raw salaries CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
raw_csv_path = '../raw_data/ds_salaries.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [11]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (count / unique / top / freq) alongside the numeric ones.
summary_stats = df.describe(include='all')
summary_stats

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
count,607.0,607.0,607,607,607,607.0,607,607.0,607,607.0,607,607
unique,,,4,4,50,,17,,57,,50,3
top,,,SE,FT,Data Scientist,,USD,,US,,US,M
freq,,,280,588,143,,398,,332,,355,326
mean,303.0,2021.405272,,,,324000.1,,112297.869852,,70.92257,,
std,175.370085,0.692133,,,,1544357.0,,70957.259411,,40.70913,,
min,0.0,2020.0,,,,4000.0,,2859.0,,0.0,,
25%,151.5,2021.0,,,,70000.0,,62726.0,,50.0,,
50%,303.0,2022.0,,,,115000.0,,101570.0,,100.0,,
75%,454.5,2022.0,,,,165000.0,,150000.0,,100.0,,


## drop unnecessary columns

In [12]:
# 'Unnamed: 0' just mirrors the row index in the preview of the raw data, so it
# carries no information and is dropped.
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


## clean salary

In [13]:
# Coerce salary_in_usd to a numeric dtype; anything that cannot be parsed becomes NaN.
salary_usd = pd.to_numeric(df['salary_in_usd'], errors='coerce')

# Impute those NaNs with the median of the successfully parsed salaries.
df['salary_in_usd'] = salary_usd.fillna(salary_usd.median())

## Map experience level

In [14]:
# Human-readable labels for the two-letter experience-level codes.
exp_map = {
    'EN': 'Entry level',
    'MI': 'Mid level',
    'SE': 'Senior level',
    'EX': 'Executive level'
}

# replace() swaps only the exact codes listed in exp_map and leaves every other
# value untouched. With map(), any value missing from the dict becomes NaN, so an
# unexpected code would be silently lost and re-running this cell would wipe the
# already-relabelled column.
df['experience_level'] = df['experience_level'].replace(exp_map)

## handle categorical columns

In [15]:
# Fallback value per categorical column; 'FT' and 'M' are the most frequent
# values reported by describe() for these two columns.
categorical_defaults = {'employment_type': 'FT', 'company_size': 'M'}

# Show the distinct values of each column before anything is filled.
for column in categorical_defaults:
    print(df[column].unique())

# Fill missing entries with the column's fallback value.
for column, default in categorical_defaults.items():
    df[column] = df[column].fillna(default)

['FT' 'CT' 'PT' 'FL']
['L' 'S' 'M']


## Save the cleaned dataset

In [16]:
# Write the cleaned frame to ../cleaned_data, creating the directory on first run.
# `os` is already imported in the imports cell at the top of the notebook, so it is
# not re-imported here.
output_dir = '../cleaned_data'
os.makedirs(output_dir, exist_ok=True)

# index=False keeps the pandas row index out of the file.
df.to_csv(f'{output_dir}/salaries_cleaned.csv', index=False)

