- experience_level: The experience level in the job during the year.
  - EN > Entry-level / Junior
  - MI > Mid-level / Intermediate
  - SE > Senior-level / Expert
  - EX > Executive-level / Director
- employment_type: The type of employment for the role.
  - PT > Part-time
  - FT > Full-time
  - CT > Contract
  - FL > Freelance
- job_title: The role worked in during the year.
- salary: The total gross salary amount paid.
- salary_currency: The currency of the salary paid as an ISO 4217 currency code.
- salary_in_usd: The salary in USD.
- employee_residence: Employee's primary country of residence during the work year as an ISO 3166 country code.
- remote_ratio: The overall amount of work done remotely.
- company_location: The country of the employer's main office or contracting branch.
- company_size: The median number of people that worked for the company during the year, coded as S (small), M (medium), or L (large).

In [184]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [185]:
# Load the salary dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv("Data_Scientist_Salaries.csv")

In [186]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(3755, 11)

In [187]:
# Peek at the first rows to check that the columns parsed as expected.
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [188]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
work_year,0
experience_level,0
employment_type,0
job_title,0
salary,0
salary_currency,0
salary_in_usd,0
employee_residence,0
remote_ratio,0
company_location,0


In [189]:
# Number of rows that exactly repeat an earlier row (all columns equal).
df.duplicated().sum()

1171

In [190]:
# Remove exact duplicate rows, keeping the first occurrence; df is modified in place.
df.drop_duplicates(inplace=True)

In [191]:
# Dropping rows leaves gaps in the index; rebuild a clean 0..n-1 index and
# discard the old one (drop=True) instead of keeping it as a column.
df.reset_index(drop=True, inplace=True)

In [192]:
# Re-check the first rows after de-duplication and re-indexing.
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [193]:
# Discard the columns that are not used as model inputs; df is modified in place.
unused_cols = [
    'work_year',
    'salary',
    'salary_currency',
    'employee_residence',
    'remote_ratio',
]
df.drop(columns=unused_cols, inplace=True)

In [194]:
# Column dtypes, non-null counts and memory footprint of the reduced frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2584 entries, 0 to 2583
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   experience_level  2584 non-null   object
 1   employment_type   2584 non-null   object
 2   job_title         2584 non-null   object
 3   salary_in_usd     2584 non-null   int64 
 4   company_location  2584 non-null   object
 5   company_size      2584 non-null   object
dtypes: int64(1), object(5)
memory usage: 121.2+ KB


In [195]:
# Collect the text-valued columns that still need encoding.
# Comparing the dtype to 'O' only matches NumPy object columns; pandas'
# dedicated string dtype (used when string inference is enabled, and the
# default in pandas 3) would be missed and the list would come back empty.
# Checking both kinds gives the same result on object-backed frames.
cat_col = [
    col
    for col in df.columns
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
]
cat_col

['experience_level',
 'employment_type',
 'job_title',
 'company_location',
 'company_size']

In [196]:
# List the distinct values of each categorical column to see what needs encoding.
for col in cat_col:
    distinct_values = df[col].unique()
    print(col, distinct_values)

experience_level ['SE' 'MI' 'EN' 'EX']
employment_type ['FT' 'CT' 'FL' 'PT']
job_title ['Principal Data Scientist' 'ML Engineer' 'Data Scientist'
 'Applied Scientist' 'Data Analyst' 'Data Modeler' 'Research Engineer'
 'Analytics Engineer' 'Business Intelligence Engineer'
 'Machine Learning Engineer' 'Data Strategist' 'Data Engineer'
 'Computer Vision Engineer' 'Data Quality Analyst'
 'Compliance Data Analyst' 'Data Architect'
 'Applied Machine Learning Engineer' 'AI Developer' 'Research Scientist'
 'Data Analytics Manager' 'Business Data Analyst' 'Applied Data Scientist'
 'Staff Data Analyst' 'ETL Engineer' 'Data DevOps Engineer' 'Head of Data'
 'Data Science Manager' 'Data Manager' 'Machine Learning Researcher'
 'Big Data Engineer' 'Data Specialist' 'Lead Data Analyst'
 'BI Data Engineer' 'Director of Data Science'
 'Machine Learning Scientist' 'MLOps Engineer' 'AI Scientist'
 'Autonomous Vehicle Technician' 'Applied Machine Learning Scientist'
 'Lead Data Scientist' 'Cloud Database E

In [197]:
from sklearn.preprocessing import OrdinalEncoder
# Encode seniority following the ladder EN < MI < SE < EX, so the numeric
# codes (0-3) keep the natural ordering of the levels.
experience_order = ['EN', 'MI', 'SE', 'EX']
oe = OrdinalEncoder(categories=[experience_order])
oe.fit(df[['experience_level']])
df['experience_level'] = oe.transform(df[['experience_level']])

In [198]:
# Company size is ordered too (S < M < L), so it gets ordinal codes 0-2.
# Note: this rebinds `oe`, replacing the experience-level encoder.
size_order = ['S', 'M', 'L']
oe = OrdinalEncoder(categories=[size_order])
size_codes = oe.fit_transform(df[['company_size']])
df['company_size'] = size_codes

In [199]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the nominal (unordered) categoricals: every column in cat_col
# except the two that were ordinal-encoded above. Selecting by name instead of
# slicing cat_col[1:-1] keeps this correct if the column order ever changes.
ordinal_cols = {'experience_level', 'company_size'}
nominal_cols = [col for col in cat_col if col not in ordinal_cols]

# One fitted encoder per column, so each can be inverse-transformed later;
# refitting a single shared encoder would only remember the last column.
label_encoders = {}
label_mappings = {}

for col in nominal_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    # classes_ is sorted, and a class's position is its code. Plain ints keep
    # the printed mapping readable regardless of the NumPy version.
    label_mappings[col] = {label: int(code) for code, label in enumerate(le.classes_)}

for col, mapping in label_mappings.items():
    print(f"Column: {col}")
    print(mapping)
    print()

Column: employment_type
{'CT': 0, 'FL': 1, 'FT': 2, 'PT': 3}

Column: job_title
{'3D Computer Vision Researcher': 0, 'AI Developer': 1, 'AI Programmer': 2, 'AI Scientist': 3, 'Analytics Engineer': 4, 'Applied Data Scientist': 5, 'Applied Machine Learning Engineer': 6, 'Applied Machine Learning Scientist': 7, 'Applied Scientist': 8, 'Autonomous Vehicle Technician': 9, 'Azure Data Engineer': 10, 'BI Analyst': 11, 'BI Data Analyst': 12, 'BI Data Engineer': 13, 'BI Developer': 14, 'Big Data Architect': 15, 'Big Data Engineer': 16, 'Business Data Analyst': 17, 'Business Intelligence Engineer': 18, 'Cloud Data Architect': 19, 'Cloud Data Engineer': 20, 'Cloud Database Engineer': 21, 'Compliance Data Analyst': 22, 'Computer Vision Engineer': 23, 'Computer Vision Software Engineer': 24, 'Data Analyst': 25, 'Data Analytics Consultant': 26, 'Data Analytics Engineer': 27, 'Data Analytics Lead': 28, 'Data Analytics Manager': 29, 'Data Analytics Specialist': 30, 'Data Architect': 31, 'Data DevOps E

In [200]:
# Check the first rows of the frame after encoding.
df.head()

Unnamed: 0,experience_level,employment_type,job_title,salary_in_usd,company_location,company_size
0,2.0,2,84,85847,25,2.0
1,1.0,0,66,30000,70,0.0
2,1.0,0,66,25500,70,0.0
3,2.0,2,47,175000,12,1.0
4,2.0,2,47,120000,12,1.0


In [201]:
# Split the frame into the feature matrix and the regression target.
target_col = 'salary_in_usd'
X = df.drop(columns=[target_col])
y = df[target_col]

In [202]:
# Inspect the feature matrix (target column removed).
X

Unnamed: 0,experience_level,employment_type,job_title,company_location,company_size
0,2.0,2,84,25,2.0
1,1.0,0,66,70,0.0
2,1.0,0,66,70,0.0
3,2.0,2,47,12,1.0
4,2.0,2,47,12,1.0
...,...,...,...,...,...
2579,2.0,2,47,70,2.0
2580,1.0,2,84,70,2.0
2581,0.0,2,47,70,0.0
2582,0.0,0,17,70,2.0


In [203]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=22)
X_train, X_test, y_train, y_test = split_parts

In [204]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so the test rows do not influence the fit.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [205]:
from xgboost import XGBRegressor
# Baseline gradient-boosted regressor; no arguments are passed, so every
# hyperparameter is left at the library default.
model = XGBRegressor()

In [206]:
# Train the baseline model on the scaled training features.
model.fit(X_train, y_train)

In [207]:
# Predict salaries for the held-out test rows.
y_pred = model.predict(X_test)

In [208]:
from sklearn.metrics import r2_score
# R^2 of the baseline predictions on the test split.
r2_score(y_test, y_pred)

0.41820453800812285

In [222]:
# Search space for tuning: 5 x 3 x 4 = 60 hyperparameter combinations.
param_grid = dict(
    n_estimators=[100, 200, 250, 300, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5, 7],
)

In [223]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over param_grid with 5-fold cross-validation; no `scoring`
# is given, so the estimator's own score method ranks the candidates.
search_settings = {'estimator': model, 'param_grid': param_grid, 'cv': 5}
grid = GridSearchCV(**search_settings)

In [224]:
# Run the search on the training split: cross-validated fits for every
# parameter combination in the grid.
grid.fit(X_train, y_train)

In [225]:
# Hyperparameter combination with the best mean cross-validated score.
grid.best_params_

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 250}

In [226]:
# Mean cross-validated score of the best parameter combination.
grid.best_score_

0.3942778765428095

In [227]:
# Predict on the test split with the search's best estimator
# (GridSearchCV refits it on the training data by default).
y_pred_grid = grid.predict(X_test)

In [228]:
# R^2 of the tuned model on the test split, for comparison with the baseline.
print(r2_score(y_test, y_pred_grid))

0.4677276809288148
