# Predicting Salary
## Using job role and work location

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.metrics import r2_score

In [7]:
data = pd.read_json("../data/employees.json")

In [8]:
data.head()

Unnamed: 0,user_id,first_name,last_name,full_name,phone_number,job_role,work_location,salary,direct_reports,is_hr
0,b16e018f-42d0-11ef-a1bb-08d23ea34e9e,Deborah,Vargas,Deborah Vargas,(730) 324-7284,Chief Executive Officer,Phoenix,313227,"[b16e2857-42d0-11ef-a636-08d23ea34e9e, b4a29c6...",False
1,b16e2857-42d0-11ef-a636-08d23ea34e9e,Pamela,White,Pamela White,(945) 954-4251,Chief Technology Officer,New York City,520214,"[b16e2858-42d0-11ef-b242-08d23ea34e9e, b16f39d...",False
2,b16e2858-42d0-11ef-b242-08d23ea34e9e,Betty,Hammond,Betty Hammond,(363) 628-4343,Director of Engineering,San Francisco,443669,"[b1713584-42d0-11ef-a25f-08d23ea34e9e, b1715ca...",False
3,b16f39d3-42d0-11ef-8b01-08d23ea34e9e,Jennifer,Lakin,Jennifer Lakin,(370) 272-8677,Director of Engineering,Hartford,231082,"[b1713585-42d0-11ef-804c-08d23ea34e9e, b1715ca...",False
4,b170e767-42d0-11ef-93c5-08d23ea34e9e,Joseph,Gobel,Joseph Gobel,(236) 484-6638,Director of Engineering,Philadelphia,330021,[b1713586-42d0-11ef-94d4-08d23ea34e9e],False


In [9]:
data["job_role"].value_counts()

job_role
Junior Developer                1000
Intern                           200
Software Engineer                100
Senior Software Engineer          50
HR Assistant                      50
Technical Lead                    20
HR Associate                      20
Project Manager                   10
Engineering Manager                5
Director of Engineering            3
HR Director                        3
Chief Executive Officer            1
Chief Technology Officer           1
Chief Communications Officer       1
Name: count, dtype: int64

In [10]:
data["work_location"].value_counts()

work_location
Los Angeles      150
Phoenix          142
San Francisco    141
Philadelphia     136
Chicago          135
San Jose         133
Dallas           131
Hartford         128
Houston          127
New York City    121
San Diego        120
Name: count, dtype: int64

In [11]:
# Convert categorical data into boolean based attributes
# Get train and test sets
X = pd.get_dummies(data[["job_role", "work_location"]])
y = data["salary"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train.head()

Unnamed: 0,job_role_Chief Communications Officer,job_role_Chief Executive Officer,job_role_Chief Technology Officer,job_role_Director of Engineering,job_role_Engineering Manager,job_role_HR Assistant,job_role_HR Associate,job_role_HR Director,job_role_Intern,job_role_Junior Developer,...,work_location_Dallas,work_location_Hartford,work_location_Houston,work_location_Los Angeles,work_location_New York City,work_location_Philadelphia,work_location_Phoenix,work_location_San Diego,work_location_San Francisco,work_location_San Jose
1330,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
724,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
254,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1068,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
1193,False,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False


In [13]:
y_train.head()

1330     58935
724     126916
254      86272
1068     81061
1193     57613
Name: salary, dtype: int64

In [14]:
# Create Linear Regression model
linreg_clf = LinearRegression()
linreg_clf.fit(X_train, y_train)

In [15]:
# Saving model to disk
file_path = 'salary_predictor.pkl'

with open(file_path, 'wb') as file:
    pickle.dump(linreg_clf, file)

In [16]:
y_pred = linreg_clf.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(r2)

0.9359915510540915


In [17]:
X_test.head(5)

Unnamed: 0,job_role_Chief Communications Officer,job_role_Chief Executive Officer,job_role_Chief Technology Officer,job_role_Director of Engineering,job_role_Engineering Manager,job_role_HR Assistant,job_role_HR Associate,job_role_HR Director,job_role_Intern,job_role_Junior Developer,...,work_location_Dallas,work_location_Hartford,work_location_Houston,work_location_Los Angeles,work_location_New York City,work_location_Philadelphia,work_location_Phoenix,work_location_San Diego,work_location_San Francisco,work_location_San Jose
1296,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
175,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
275,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
548,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
1000,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False


In [18]:
y_test.head(50)

1296     57223
175     171511
275     109732
548      66022
1000    117225
917     106682
990      70758
881     129863
1066     90925
1457    112645
1251     36121
322      81121
615      98438
811      98288
1417     80802
218      69331
810     119298
76      163309
49      208485
1089    111279
67      128159
1177     81110
1326     63038
882      71667
244      63322
937      86632
426      98329
433      65050
1141    121580
613      86802
718      79414
764      89826
885     118585
411      67716
900      94959
192     115779
168     178083
916      58346
1252     34893
380     125584
950     120021
203     124288
855      92778
847     115070
1118    107072
701     118008
1075     97580
1201     36769
277      84141
774      76842
Name: salary, dtype: int64

In [19]:
y_pred[:50]

array([ 52312., 162928., 105784.,  69000., 125448., 112568.,  61720.,
       125448.,  87512., 106512.,  30936.,  82408., 105784., 105784.,
        73296.,  69000., 125448., 166000., 196160., 120120., 152592.,
        82408.,  70936.,  69000.,  69000.,  87512., 105784.,  69000.,
       125448.,  82408.,  87512.,  93944., 120120.,  72568., 100120.,
       120120., 162928.,  61720.,  30936., 125448., 112568., 125448.,
        93944., 125448., 100120., 112568.,  87512.,  20088.,  93944.,
        69000.])