In [23]:
# Dependencies and Setup
import pandas as pd
import hvplot
import hvplot.pandas
import requests
import geopandas as gpd
from pathlib import Path

In [24]:
# CSV holding the USD conversion rate for each foreign currency code
USD_to_foreign = Path("USD to foreign currency.csv")

# Build a {currency code: rate} dict from the file.
# The file is read without a header so the first column becomes the index and
# the second column (label 1) carries the values; the "Currency" header row is
# then dropped and every remaining value (read as text) is converted to float.
USD_conversion = {
    currency: float(rate)
    for currency, rate in (
        pd.read_csv(USD_to_foreign, header=None, index_col=0)[1]
        .drop("Currency")
        .items()
    )
}
USD_conversion

{'GBP': 1.22, 'CAD': 0.72}

In [26]:
# Path of the salary dataset, relative to the notebook's working directory
salary_data_to_load = Path("salaries.csv")

# Load the salary records into a DataFrame and preview the first five rows
salary_data = pd.read_csv(filepath_or_buffer=salary_data_to_load)
salary_data.head(n=5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,EN,FT,Machine Learning Engineer,36000,USD,36000,UG,100,US,M
1,2023,MI,FT,Research Engineer,300000,USD,300000,US,0,US,M
2,2023,MI,FT,Research Engineer,100000,USD,100000,US,0,US,M
3,2023,MI,FT,Research Engineer,300000,USD,300000,US,0,US,M
4,2023,MI,FT,Research Engineer,100000,USD,100000,US,0,US,M


In [27]:
# Number of salary records per employee residence (non-null "work_year" values
# are counted for each group), ordered from the most to the least common.
employee_residence = (
    salary_data
    .groupby("employee_residence")[["work_year"]]
    .count()
    .rename(columns={"work_year": "Count"})
    .sort_values(by=["Count"], ascending=False)
    .reset_index()
)
employee_residence

Unnamed: 0,employee_residence,Count
0,US,6907
1,GB,396
2,CA,180
3,ES,111
4,IN,65
...,...,...
81,KW,1
82,LU,1
83,MT,1
84,MU,1


In [28]:
# Number of salary records per company location (non-null "work_year" values
# are counted for each group), ordered from the most to the least common.
employee_company_location = (
    salary_data
    .groupby("company_location")[["work_year"]]
    .count()
    .rename(columns={"work_year": "Count"})
    .sort_values(by=["Count"], ascending=False)
    .reset_index()
)
employee_company_location

Unnamed: 0,company_location,Count
0,US,6956
1,GB,403
2,CA,181
3,ES,108
4,DE,72
...,...,...
68,IQ,1
69,IR,1
70,MD,1
71,MT,1


In [29]:
# Keep only the records whose employee residence is among the first three rows
# of employee_residence (i.e. the three residences with the highest counts,
# since that table is sorted by Count in descending order).
employee_salary = salary_data.loc[
    salary_data["employee_residence"].isin(
        employee_residence["employee_residence"].head(3)
    )
]
employee_salary

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
1,2023,MI,FT,Research Engineer,300000,USD,300000,US,0,US,M
2,2023,MI,FT,Research Engineer,100000,USD,100000,US,0,US,M
3,2023,MI,FT,Research Engineer,300000,USD,300000,US,0,US,M
4,2023,MI,FT,Research Engineer,100000,USD,100000,US,0,US,M
5,2023,MI,FT,Data Analyst,116700,USD,116700,US,0,US,M
...,...,...,...,...,...,...,...,...,...,...,...
8122,2021,SE,FT,Data Specialist,165000,USD,165000,US,100,US,L
8123,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
8124,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
8125,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S


In [30]:
# Keep only the records whose company location is among the first three rows
# of employee_company_location (i.e. the three locations with the highest
# counts, since that table is sorted by Count in descending order).
employee_salary_location = salary_data.loc[
    salary_data["company_location"].isin(
        employee_company_location["company_location"].head(3)
    )
]
employee_salary_location

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,EN,FT,Machine Learning Engineer,36000,USD,36000,UG,100,US,M
1,2023,MI,FT,Research Engineer,300000,USD,300000,US,0,US,M
2,2023,MI,FT,Research Engineer,100000,USD,100000,US,0,US,M
3,2023,MI,FT,Research Engineer,300000,USD,300000,US,0,US,M
4,2023,MI,FT,Research Engineer,100000,USD,100000,US,0,US,M
...,...,...,...,...,...,...,...,...,...,...,...
8122,2021,SE,FT,Data Specialist,165000,USD,165000,US,100,US,L
8123,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
8124,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
8125,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S


In [31]:
# Mean salary in USD for each of the top 3 employee residences, rounded to cents
average_salary_by_employee_location = (
    employee_salary
    .groupby("employee_residence")["salary_in_usd"]
    .mean()
    .round(decimals=2)
)
average_salary_by_employee_location

employee_residence
CA    143035.73
GB    107193.29
US    159168.48
Name: salary_in_usd, dtype: float64

In [32]:
# Mean salary in USD for each of the top 3 company locations, rounded to cents
average_salary_by_company_location = (
    employee_salary_location
    .groupby("company_location")["salary_in_usd"]
    .mean()
    .round(decimals=2)
)
average_salary_by_company_location

company_location
CA    142539.52
GB    106626.74
US    158613.88
Name: salary_in_usd, dtype: float64

In [33]:
# Median salary in USD for each of the top 3 employee residences, rounded to cents
median_salary_by_employee_location = (
    employee_salary
    .groupby("employee_residence")["salary_in_usd"]
    .median()
    .round(decimals=2)
)
median_salary_by_employee_location

employee_residence
CA    134500.0
GB     92280.0
US    150000.0
Name: salary_in_usd, dtype: float64

In [34]:
# Median salary in USD for each of the top 3 company locations, rounded to cents.
# The result is stored under a name that matches the sibling company-location
# statistics (average_salary_by_company_location, standard_dev_salary_by_company_location).
median_salary_by_company_location = (
    employee_salary_location
    .groupby("company_location")["salary_in_usd"]
    .median()
    .round(2)
)

# Backward-compatible alias: the original variable name mentions
# "employee_location" although the data is grouped by company location.
# It is kept so that any cell still referring to the old name keeps working.
median_salary_by_employee_location_salary_by_company_location = median_salary_by_company_location

median_salary_by_company_location

company_location
CA    135000.0
GB     92280.0
US    150000.0
Name: salary_in_usd, dtype: float64

In [35]:
# Most frequent salary (USD) for each of the top 3 employee residences.
# pd.Series.mode returns every modal value of a group, so the result carries a
# second index level that numbers the modes within each residence.
mode_salary_by_employee_location = (
    employee_salary
    .groupby("employee_residence")["salary_in_usd"]
    .apply(pd.Series.mode)
)
mode_salary_by_employee_location

employee_residence   
CA                  0    190000
GB                  0     73824
US                  0    150000
Name: salary_in_usd, dtype: int64

In [36]:
# Most frequent salary (USD) for each of the top 3 company locations.
# pd.Series.mode returns every modal value of a group, so the result carries a
# second index level that numbers the modes within each location.
# The result is stored under a name that matches the sibling company-location
# statistics (average_salary_by_company_location, standard_dev_salary_by_company_location).
mode_salary_by_company_location = (
    employee_salary_location
    .groupby("company_location")["salary_in_usd"]
    .apply(pd.Series.mode)
)

# Backward-compatible alias: the original variable name mentions
# "employee_location" although the data is grouped by company location.
# It is kept so that any cell still referring to the old name keeps working.
mode_salary_by_employee_location_salary_by_company_location = mode_salary_by_company_location

mode_salary_by_company_location

company_location   
CA                0    190000
GB                0     73824
US                0    150000
Name: salary_in_usd, dtype: int64

In [37]:
# Sample standard deviation (ddof=1, the pandas default) of the USD salary for
# each of the top 3 employee residences, rounded to cents
standard_dev_of_salary_by_employee_location = (
    employee_salary
    .groupby("employee_residence")["salary_in_usd"]
    .std(ddof=1)
    .round(decimals=2)
)
standard_dev_of_salary_by_employee_location

employee_residence
CA    60887.04
GB    62619.99
US    59012.15
Name: salary_in_usd, dtype: float64

In [38]:
# Sample standard deviation (ddof=1, the pandas default) of the USD salary for
# each of the top 3 company locations, rounded to cents
standard_dev_salary_by_company_location = (
    employee_salary_location
    .groupby("company_location")["salary_in_usd"]
    .std(ddof=1)
    .round(decimals=2)
)
standard_dev_salary_by_company_location

company_location
CA    62163.09
GB    62280.05
US    59361.42
Name: salary_in_usd, dtype: float64

In [39]:
# Highest USD salary across the whole dataset (all locations, not just the top 3)
finding_max = salary_data.loc[:, "salary_in_usd"].max()
finding_max

450000

In [40]:
# Lowest USD salary across the whole dataset (all locations, not just the top 3)
finding_min = salary_data.loc[:, "salary_in_usd"].min()
finding_min

15000