# Data Scientist Salaries - Data Processing and analysis  

In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
from sklearn.preprocessing import LabelEncoder

## Data Preparation

In [147]:
df = pd.read_csv('ds_salaries.csv')
print(df.shape)
print('\n')
df.info()

(3755, 11)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


In [148]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [149]:
df.dtypes

work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object

In [150]:
# Extract categorical variables
cat_vars = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']

# Create LabelEncoder object
encoder = LabelEncoder()

# Encode categorical variables as integers
for var in cat_vars:
    df[var] = encoder.fit_transform(df[var])

In [151]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,3,2,84,80000,EUR,85847,26,100,25,0
1,2023,2,0,66,30000,USD,30000,75,100,70,2
2,2023,2,0,66,25500,USD,25500,75,100,70,2
3,2023,3,2,47,175000,USD,175000,11,100,12,1
4,2023,3,2,47,120000,USD,120000,11,100,12,1


In [152]:
df.drop(['salary','salary_currency'] , axis='columns', inplace = True)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,3,2,84,85847,26,100,25,0
1,2023,2,0,66,30000,75,100,70,2
2,2023,2,0,66,25500,75,100,70,2
3,2023,3,2,47,175000,11,100,12,1
4,2023,3,2,47,120000,11,100,12,1


## Data Transformation

In [191]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df2 = scaler.fit_transform(df)

## Feature Selection

## Regression - Salary Prediction

In this section, a regression model is going to be trained and tested to predict the salaries of Data Scientists based on the other featuress.

In [74]:
df.salary_in_usd.describe()

count      3755.000000
mean     137570.389880
std       63055.625278
min        5132.000000
25%       95000.000000
50%      135000.000000
75%      175000.000000
max      450000.000000
Name: salary_in_usd, dtype: float64

In [192]:
from sklearn.model_selection import train_test_split

df2 = pd.DataFrame(df2, columns = df.columns)

features = df2.drop('salary_in_usd', axis=1)
target = df2['salary_in_usd'].values

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42, shuffle=True)

In [193]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

In [194]:
print(X_train.shape)
df3 = pd.DataFrame(X_train)
df3.head()

(3004, 8)


Unnamed: 0,work_year,experience_level,employment_type,job_title,employee_residence,remote_ratio,company_location,company_size
2238,-0.540438,0.585736,0.025927,-0.359431,-2.087565,-0.952433,-2.121818,0.207876
485,0.905994,-0.517846,0.025927,2.662006,0.460186,1.105918,0.450625,0.207876
2177,-0.540438,0.585736,0.025927,-0.791065,0.460186,-0.952433,0.450625,0.207876
3305,-0.540438,0.585736,0.025927,-0.359431,0.460186,1.105918,0.450625,0.207876
1769,0.905994,0.585736,0.025927,-0.359431,0.460186,1.105918,0.450625,0.207876


In [195]:
model = Sequential([
    tf.keras.Input(shape=(8,)),
    Dense(units = 25, activation = 'relu'),
    Dense(units = 10, activation = 'relu'),
    Dense(units = 5, activation = 'relu'),
    Dense(units = 1, activation = 'linear')
])

In [206]:
from keras.optimizers import Adam
from keras.losses import MeanSquaredError
model.compile(optimizer = Adam(learning_rate = 0.01), loss='mean_squared_error')
model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_65 (Dense)            (None, 25)                225       
                                                                 
 dense_66 (Dense)            (None, 10)                260       
                                                                 
 dense_67 (Dense)            (None, 5)                 55        
                                                                 
 dense_68 (Dense)            (None, 1)                 6         
                                                                 
Total params: 546
Trainable params: 546
Non-trainable params: 0
_________________________________________________________________


In [207]:
y_train

array([-1.59876005,  0.22569763, -0.59590867, ...,  2.73492768,
       -1.18833756, -0.84884603])

In [208]:
model.fit(X_train, y_train, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x233d6bea790>

In [209]:
y_hat = model.predict(X_test)



In [210]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_pred = y_hat, y_true = y_test)

0.658391269666481