In [2]:
import pandas
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Keep rendered frames compact: at most 10 columns and 10 rows are shown.
pandas.set_option("display.max_columns", 10)
pandas.set_option("display.max_rows", 10)

# Read the salary dataset from a CSV file in the current working directory.
ds_salary_data = pandas.read_csv("ds_salaries.csv")

# Bare last expression so the notebook renders the (truncated) frame.
ds_salary_data

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,...,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,...,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,...,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,...,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,...,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,...,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...
602,602,2022,SE,FT,Data Engineer,...,154000,US,100,US,M
603,603,2022,SE,FT,Data Engineer,...,126000,US,100,US,M
604,604,2022,SE,FT,Data Analyst,...,129000,US,0,US,M
605,605,2022,SE,FT,Data Analyst,...,150000,US,100,US,M


In [4]:
# Columns that must not be used as model inputs: the leftover index column,
# the target and its un-converted variants, and the columns this notebook
# chooses to leave out of the model.
excluded_columns = (
    "Unnamed: 0",
    "salary_in_usd",
    "company_location",
    "job_title",
    "salary",
    "salary_currency",
    "employee_residence",
)

# Start from every column and strip the excluded ones; list.remove raises
# ValueError if a name is missing, which surfaces schema changes early.
feature_columns = ds_salary_data.columns.tolist()
for column_name in excluded_columns:
    feature_columns.remove(column_name)
feature_columns

['work_year',
 'experience_level',
 'employment_type',
 'remote_ratio',
 'company_size']

In [5]:
# Target: salary converted to USD.
df_y = ds_salary_data["salary_in_usd"]

# Inputs: the selected feature columns, dummy-encoded. drop_first=True drops
# the first level of each encoded column so that level acts as the baseline.
selected_features = ds_salary_data[feature_columns]
df_x = pandas.get_dummies(selected_features, drop_first=True)

In [6]:
# Summary statistics of the encoded feature matrix, as a sanity check on the
# numeric columns and the generated dummy columns.
df_x.describe()

Unnamed: 0,work_year,remote_ratio,experience_level_EX,experience_level_MI,experience_level_SE,employment_type_FL,employment_type_FT,employment_type_PT,company_size_M,company_size_S
count,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0
mean,2021.405272,70.92257,0.042834,0.350906,0.461285,0.00659,0.968699,0.016474,0.537068,0.136738
std,0.692133,40.70913,0.202649,0.477647,0.49891,0.080976,0.174275,0.127396,0.499035,0.343854
min,2020.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2021.0,50.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,2022.0,100.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
75%,2022.0,100.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
max,2022.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Linear regression estimator with default settings; not fitted yet.
reg = linear_model.LinearRegression()

In [8]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.25,
    random_state=0,
)

In [9]:
# Fit the regression on the training split only.
reg.fit(x_train, y_train)

In [10]:
# Predict on the held-out split, then report the mean absolute error in the
# target's own units (salary_in_usd).
y_pred = reg.predict(x_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

46658.457592631086

In [12]:
# Fitted regression coefficients, one per column of the training matrix.
# NOTE(review): these are raw coefficients on unscaled features, so their
# magnitudes are not directly comparable as "importance" — confirm intent.
feature_importance = reg.coef_

In [31]:
# Print every fitted coefficient next to the feature it belongs to.
#
# The names are taken from x_train, the frame the model was actually fitted
# on, so they are guaranteed to line up with reg.coef_. The previous version
# re-ran pandas.get_dummies over the whole dataset once per coefficient just
# to look up a column name by position.
#
# NOTE(review): the scores are raw regression coefficients (change in
# predicted salary_in_usd per unit of the feature). The features are on
# different scales, so compare magnitudes with care.
for feature_name, coefficient in zip(x_train.columns, feature_importance):
    print('Feature: {}, Score: {}'.format(feature_name, coefficient))

Feature: work_year, Score: 4149.076992744721
Feature: remote_ratio, Score: 136.25098820704716
Feature: experience_level_EX, Score: 97458.51303406496
Feature: experience_level_MI, Score: 23658.211847422073
Feature: experience_level_SE, Score: 68212.83726766534
Feature: employment_type_FL, Score: -44772.113718193
Feature: employment_type_FT, Score: 5164.2379197521095
Feature: employment_type_PT, Score: -29671.557980194128
Feature: company_size_M, Score: -13347.825183984763
Feature: company_size_S, Score: -29533.849156243086


In [13]:
# Recompute the held-out predictions and print them, followed by the true
# target values, one object per line block.
y_pred = reg.predict(x_test)
print(y_pred, y_test, sep="\n")

[141320.35945071  41810.79481494  83140.63520976 141320.35945071
 133022.20546522   6974.99891499 141043.08581398  73107.52218304
 150519.10764194 154668.18463469 127695.26063    141320.35945071
  99151.93281135 141320.35945071 154668.18463469  96765.73403046
 141320.35945071 141320.35945071  73107.52218304 141320.35945071
 101815.40522896 150519.10764194 105964.4822217  125134.33547845
 150230.9342521  110113.55921445  62805.53424475  52772.42121804
  99151.93281135  83140.63520976 127695.26063    141320.35945071
  86455.34736702  78991.55821701  92339.383401   141320.35945071
 127695.26063    141320.35945071  96765.73403046 127695.26063
  99151.93281135 143706.55823159 101815.40522896 141320.35945071
 141320.35945071 170566.03521711  83140.63520976 141320.35945071
 103211.08267225  96765.73403046 141320.35945071  83140.63520976
 141320.35945071  62805.53424475 154668.18463469 150519.10764194
  96488.46039374 141320.35945071 101815.40522896 150519.10764194
  68958.4451903   82306.2703