<a href="https://www.kaggle.com/code/tmishinev/data-science-salaries-eda-plotly?scriptVersionId=105291175" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

![](https://images.unsplash.com/photo-1599658880436-c61792e70672?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1470&q=80)

Photo Source (Unsplash - Myriam Jessier) - https://images.unsplash.com/photo-1599658880436-c61792e70672?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1470&q=80

<div style='color:#203354;border: 2px outset #203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:30px;font-weight:bold;text-align:center;'>
    Data Science Job Salaries Dataset
</div>

#### [1. EDA](#eda)
 - [1.1 Numerical Features](#numerical) <br>
 - [1.2 Categorical Features](#categorical) <br>

#### [2. Location](#location)
 - [2.1 Salary by Location](#salary_location)
 - [2.2 Experience Level by Location](#experience_location)
 
#### [3. Experience](#experience)
 - [3.1 Salary by Experience Level](#salary_experience)
 - [3.2 Remote Ratio by Experience Level](#remote_experience)
 
#### [4. Company Size](#company)
 - [4.1 Salary by Company Size](#salary_company)
 - [4.2 Remote Ratio by Company Size](#remote_company)
 - [4.3 Exp. Level by Company Size](#experience_company)

In [1]:
#install 
!pip install country_converter
import pandas as pd
import country_converter
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Shared figure width: every chart below passes this as `width` to update_layout.
WIDTH = 800

Collecting country_converter
  Downloading country_converter-0.7.7.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.2/51.2 kB[0m [31m375.2 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: country_converter
  Building wheel for country_converter (setup.py) ... [?25l- \ done
[?25h  Created wheel for country_converter: filename=country_converter-0.7.7-py3-none-any.whl size=53786 sha256=85ae13721173d95301e792a5d3c5d5a77f023e95d4ed476bf7f1c5ebdf0b79aa
  Stored in directory: /root/.cache/pip/wheels/e8/e6/60/61798a8a91462250002293d1c8cc8de90a130119a813277ccc
Successfully built country_converter
Installing collected packages: country_converter
Successfully installed country_converter-0.7.7
[0m

<a id = 'eda'></a>
## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:20px;font-weight:bold;text-align:left;'>1. EDA</div>

In [2]:
# Read the salaries CSV and discard the 'Unnamed: 0' column.
df = (
    pd.read_csv('../input/data-science-job-salaries/ds_salaries.csv')
    .drop(columns=['Unnamed: 0'])
)

# Swap the two-letter experience codes for readable labels.
df['experience_level'] = df['experience_level'].map(
    {'EN': 'Junior', 'MI': 'Middle', 'SE': 'Senior', 'EX': 'Executive'}
)

# Convert both country columns to ISO3 codes, company location first.
for country_col in ('company_location', 'employee_residence'):
    df[country_col] = country_converter.convert(names=df[country_col], to="ISO3")

df.head(3)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,Middle,FT,Data Scientist,70000,EUR,79833,DEU,0,DEU,L
1,2020,Senior,FT,Machine Learning Scientist,260000,USD,260000,JPN,0,JPN,S
2,2020,Senior,FT,Big Data Engineer,85000,GBP,109024,GBR,50,GBR,M


<a id = 'numerical'></a>

## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:17px;font-weight:bold;text-align:left;'>1.1 Numerical Features Distribution</div>

In [3]:
# Salary histogram coloured by work year, with a marginal box plot.
fig = px.histogram(
    df,
    x='salary_in_usd',
    color='work_year',
    nbins=60,
    marginal="box",
    opacity=0.8,
    color_discrete_sequence=['#203354', '#bbbbbb', '#751c35'],
)
fig.update_layout(width=WIDTH, title_text="Distribution of Salaries per Year")
fig.show()

In [4]:
def perc_distrib(df, x, y):
    """Build a grouped bar chart of the percentage split of ``y`` within each ``x``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table containing the columns named by ``x`` and ``y``.
        It is only read; the caller's frame is left unmodified.
    x : str
        Column whose values form the bar groups on the x-axis.
    y : str
        Column whose values are cast to ``str`` (so they are coloured as
        discrete categories) and counted inside every ``x`` group.

    Returns
    -------
    The figure produced by ``px.bar``: one bar per (``x``, ``y``) pair whose
    height is that pair's share of its ``x`` group, in percent, rounded to
    one decimal place.
    """
    # Work on a two-column copy. Casting ``df[y]`` in place would silently turn
    # the caller's column into strings for every cell that runs afterwards.
    pairs = df[[x, y]].copy()
    pairs[y] = pairs[y].astype(str)

    # Row count per (x, y) pair; counting rows avoids depending on a 'salary' column.
    df_rmt = pairs.groupby([x, y]).size().reset_index(name='count')
    # Total rows of each x group, aligned row-by-row with df_rmt.
    group_total = df_rmt.groupby(x)['count'].transform('sum')
    df_rmt[y + '_perc'] = (df_rmt['count'] / group_total * 100).round(1)

    fig = px.bar(
        df_rmt,
        x=x,
        y=y + '_perc',
        color=y,
        barmode="group",
        opacity=0.8,
        color_discrete_sequence=['#203354', '#bbbbbb', '#751c35', '#101010'],
    )
    fig.update_layout(width=WIDTH, title_text=f"Distribution of {y} per {x} (Percentage)")
    return fig


# Percentage split of remote_ratio within each work year.
fig = perc_distrib(df, x='work_year', y='remote_ratio')
fig.show()

<a id = 'categorical'></a>

## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:17px;font-weight:bold;text-align:left;'>1.2 Categorical Features </div>

In [5]:
# Categorical columns to plot, mapped to their subplot titles (top to bottom).
categorical_titles = {
    'experience_level': "Experience Level",
    'employment_type': "Employment Type",
    'employee_residence': "Employee Residence",
    'company_location': "Company Location",
    'company_size': "Company Size",
}
categorical = list(categorical_titles)
numerical = categorical  # legacy (misleading) name kept in case another cell still refers to it

# One subplot row per column, so the grid always matches the column list.
fig = make_subplots(
    rows=len(categorical),
    cols=1,
    subplot_titles=tuple(categorical_titles.values()),
)

for row, col in enumerate(categorical, start=1):
    fig.add_trace(
        go.Histogram(x=df[col], marker=dict(color='#203354', opacity=0.8), name=col),
        row=row,
        col=1,
    )

fig.update_layout(width=WIDTH, height=2000, title_text="Categorical Features", showlegend=False)
fig.show()

In [6]:
# Ten most frequent job titles, sorted ascending so the most frequent title is the last row.
# The columns are named explicitly because the names produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df_jobs = (
    df['job_title']
    .value_counts(ascending=True)
    .tail(10)
    .rename_axis('job_title')
    .reset_index(name='count')
)

fig = go.Figure(
    go.Bar(
        y=df_jobs['job_title'],
        x=df_jobs['count'],
        orientation='h',
        marker=dict(color='#203354', opacity=0.8),
    )
)
fig.update_layout(width=WIDTH, height=500, title_text="Job Title Distribution", showlegend=False)
fig.show()

We can see that the dominant job titles are Data Scientist, Data Engineer and Data Analyst.

<a id = 'location'></a>

## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:20px;font-weight:bold;text-align:left;'>2. Location </div>

<a id = 'salary_location'></a>

## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:17px;font-weight:bold;text-align:left;'>2.1 Salary by Location </div>

In [7]:
# Mean USD salary per company location, highest first.
df_mean = (
    df.groupby('company_location')[['salary_in_usd']]
    .mean()
    .sort_values('salary_in_usd', ascending=False)
)

# World map coloured by mean salary. The title is set once, in update_layout;
# a separate `title=` argument here would only be overwritten.
fig = px.choropleth(
    locations=df_mean.index,
    color=df_mean['salary_in_usd'],
    color_continuous_scale=px.colors.sequential.RdBu_r,
)
fig.update_layout(width=WIDTH, height=600, title_text="Salary by Company Location")
fig.show()

# Bar chart of the 20 highest-paying locations (slice computed once and reused).
df_top20 = df_mean.head(20)
fig_bar = px.bar(
    x=df_top20.index,
    y=df_top20['salary_in_usd'],
    color=df_top20['salary_in_usd'],
    color_continuous_scale=px.colors.sequential.RdBu_r,
)
fig_bar.update_layout(width=WIDTH, height=500, title_text="TOP 20 Salary by Company Location")
fig_bar.update_xaxes(title='Country')
fig_bar.update_yaxes(title='Salary in USD')
fig_bar.show()

<a id = 'experience_location'></a>
 
## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:17px;font-weight:bold;text-align:left;'>2.2 Experience Level by Location (in % of total employees per Country) </div>

<font size = 3.5> Data from the US dominates the dataset, so instead of raw counts we calculate, for each country, the percentage of employees in each Experience Level category.</font>

In [8]:
# Number of rows (counted via 'salary') per company location.
df_total = df.groupby(['company_location'])[['salary']].count().reset_index()

# Same count per (location, experience level), joined with the location total;
# the merge suffixes the two counts as salary_x (pair) and salary_y (total).
df_exp = (
    df.groupby(['company_location', 'experience_level'])[['salary']]
    .count()
    .reset_index()
    .merge(df_total, on='company_location')
)

# Share of each experience level within its location, in percent (one decimal).
df_exp['exp_perc'] = (df_exp['salary_x'] / df_exp['salary_y'] * 100).round(1)

In [9]:
def exp_level_choropleth(df_exp, level, title):
    """Map the percentage share of one experience level per company location.

    Parameters
    ----------
    df_exp : pandas.DataFrame
        Table with 'company_location', 'experience_level' and 'exp_perc' columns.
    level : str
        Value of 'experience_level' whose rows are plotted.
    title : str
        Figure title.

    Returns
    -------
    The choropleth figure, sized WIDTH x 600; it is not shown here.
    """
    level_rows = df_exp.loc[df_exp['experience_level'] == level]
    fig = px.choropleth(
        locations=level_rows['company_location'],
        color=level_rows['exp_perc'],
        color_continuous_scale=px.colors.sequential.RdBu_r,
    )
    fig.update_layout(width=WIDTH, height=600, title_text=title)
    return fig


# One map per experience level; the original figure names are kept.
fig_j = exp_level_choropleth(df_exp, 'Junior', "Junior Employees as Percentage of All")
fig_j.show()

fig_m = exp_level_choropleth(df_exp, 'Middle', "Middle-Level Employees as Percentage of All")
fig_m.show()

fig_s = exp_level_choropleth(df_exp, 'Senior', "Senior Employees as Percentage of All")
fig_s.show()

fig_e = exp_level_choropleth(df_exp, 'Executive', "Executive Employees as Percentage of All")
fig_e.show()

<a id = 'experience'></a>

## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:20px;font-weight:bold;text-align:left;'>3. Experience Level </div>

<a id = 'salary_experience'></a>

## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:17px;font-weight:bold;text-align:left;'>3.1 Salary by Experience Level </div>

In [10]:
# Salary histogram coloured by experience level, with a marginal box plot.
fig = px.histogram(
    df,
    x='salary_in_usd',
    color='experience_level',
    nbins=60,
    marginal="box",
    opacity=0.8,
    color_discrete_sequence=['#203354', '#bbbbbb', '#751c35', '#101010'],
)
fig.update_layout(width=WIDTH, title_text="Distribution of Salaries per Exp. Level")
fig.show()

<a id = 'remote_experience'></a>
## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:17px;font-weight:bold;text-align:left;'>3.2 Remote Ratio by Experience Level </div>

In [11]:

# Percentage split of remote_ratio within each experience level.
fig = perc_distrib(df, x='experience_level', y='remote_ratio')
fig.show()

<a id = 'company'></a>

## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:20px;font-weight:bold;text-align:left;'>4. Company Size</div>

<a id = 'salary_company'></a>

## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:17px;font-weight:bold;text-align:left;'>4.1 Salary by Company Size</div>

In [12]:
# Salary histogram coloured by company size, with a marginal box plot.
fig = px.histogram(
    df,
    x='salary_in_usd',
    color='company_size',
    nbins=60,
    marginal="box",
    opacity=0.8,
    color_discrete_sequence=['#203354', '#bbbbbb', '#751c35'],
)
# The title names the variable actually used for colouring (company size).
fig.update_layout(width=WIDTH, title_text="Distribution of Salaries per Company Size")
fig.show()

<a id = 'remote_company'></a>

## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:17px;font-weight:bold;text-align:left;'>4.2 Remote Ratio by Company Size</div>

In [13]:
# Percentage split of remote_ratio within each company size.
fig = perc_distrib(df, x='company_size', y='remote_ratio')
fig.show()

<a id = 'experience_company'></a>

## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:17px;font-weight:bold;text-align:left;'>4.3 Experience Level by Company Size</div>

In [14]:
# Percentage split of experience_level within each company size.
fig = perc_distrib(df, x='company_size', y='experience_level')
fig.show()

## 

## <div style='color:#203354;background-color:rgba(222, 222, 222, 0.85);padding:10px 10px;font-size:20px;font-weight:bold;text-align:center;'>Thanks for reading and upvote if you like it :)</div>

World Population Dataset Notebook: https://www.kaggle.com/code/tmishinev/world-population-eda-map-animations