# Predicting Salary
## Using job role and work location

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.metrics import r2_score

In [21]:
data = pd.read_json("../data/employees.json")

In [22]:
data.head()

Unnamed: 0,user_id,first_name,last_name,full_name,phone_number,email,job_role,work_location,salary,direct_reports,is_hr,manager
0,d754fb95-42d5-11ef-b67e-08d23ea34e9e,Martha,Jones,Martha Jones,(975) 387-8507,marthajones@example.com,Chief Executive Officer,Chicago,434869,"[d755bea7-42d5-11ef-8029-08d23ea34e9e, daaa967...",False,
1,d755bea7-42d5-11ef-8029-08d23ea34e9e,Mary,Sass,Mary Sass,(561) 431-6750,marysass@example.com,Chief Technology Officer,Houston,299824,"[d757454c-42d5-11ef-871f-08d23ea34e9e, d757454...",False,d754fb95-42d5-11ef-b67e-08d23ea34e9e
2,d757454c-42d5-11ef-871f-08d23ea34e9e,Dean,Brady,Dean Brady,(755) 764-1137,deanbrady@example.com,Director of Engineering,Philadelphia,317775,"[d757ba76-42d5-11ef-b908-08d23ea34e9e, d7591a0...",False,d755bea7-42d5-11ef-8029-08d23ea34e9e
3,d757454d-42d5-11ef-a14a-08d23ea34e9e,Thomas,Richardson,Thomas Richardson,(960) 497-2473,thomasrichardson@example.com,Director of Engineering,San Jose,419265,"[d7587e1a-42d5-11ef-b624-08d23ea34e9e, d75c4e8...",False,d755bea7-42d5-11ef-8029-08d23ea34e9e
4,d757ba75-42d5-11ef-b394-08d23ea34e9e,Austin,Bays,Austin Bays,(409) 958-7548,austinbays@example.com,Director of Engineering,Hartford,221947,[d7587e1b-42d5-11ef-bb04-08d23ea34e9e],False,d755bea7-42d5-11ef-8029-08d23ea34e9e


In [23]:
data["job_role"].value_counts()

job_role
Junior Developer                1000
Intern                           200
Software Engineer                100
Senior Software Engineer          50
HR Assistant                      50
Technical Lead                    20
HR Associate                      20
Project Manager                   10
Engineering Manager                5
Director of Engineering            3
HR Director                        3
Chief Executive Officer            1
Chief Technology Officer           1
Chief Communications Officer       1
Name: count, dtype: int64

In [24]:
data["work_location"].value_counts()

work_location
Los Angeles      145
Houston          139
San Jose         139
Dallas           138
Philadelphia     137
Hartford         137
San Francisco    136
Phoenix          133
San Diego        121
New York City    120
Chicago          119
Name: count, dtype: int64

In [25]:
# Convert categorical data into boolean based attributes
# Get train and test sets
X = pd.get_dummies(data[["job_role", "work_location"]])
y = data["salary"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
X_train.head()

Unnamed: 0,job_role_Chief Communications Officer,job_role_Chief Executive Officer,job_role_Chief Technology Officer,job_role_Director of Engineering,job_role_Engineering Manager,job_role_HR Assistant,job_role_HR Associate,job_role_HR Director,job_role_Intern,job_role_Junior Developer,...,work_location_Dallas,work_location_Hartford,work_location_Houston,work_location_Los Angeles,work_location_New York City,work_location_Philadelphia,work_location_Phoenix,work_location_San Diego,work_location_San Francisco,work_location_San Jose
1330,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
724,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
254,False,False,False,False,False,False,False,False,False,True,...,True,False,False,False,False,False,False,False,False,False
1068,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
1193,False,False,False,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,False


In [27]:
y_train.head()

1330     50292
724     110995
254      74899
1068     69107
1193     40952
Name: salary, dtype: int64

In [28]:
# Create Linear Regression model
linreg_clf = LinearRegression()
linreg_clf.fit(X_train, y_train)

In [29]:
# Saving model to disk
file_path = 'salary_predictor.pkl'

with open(file_path, 'wb') as file:
    pickle.dump(linreg_clf, file)

In [30]:
y_pred = linreg_clf.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(r2)

0.9375985062257842


In [31]:
X_test.head(5)

Unnamed: 0,job_role_Chief Communications Officer,job_role_Chief Executive Officer,job_role_Chief Technology Officer,job_role_Director of Engineering,job_role_Engineering Manager,job_role_HR Assistant,job_role_HR Associate,job_role_HR Director,job_role_Intern,job_role_Junior Developer,...,work_location_Dallas,work_location_Hartford,work_location_Houston,work_location_Los Angeles,work_location_New York City,work_location_Philadelphia,work_location_Phoenix,work_location_San Diego,work_location_San Francisco,work_location_San Jose
1296,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
175,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
275,False,False,False,False,False,False,False,False,False,True,...,False,False,True,False,False,False,False,False,False,False
548,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1000,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False


In [32]:
y_test.head(50)

1296     52754
175     146334
275      69184
548      86344
1000    127127
917      84661
990     109021
881     107277
1066     83407
1457    103759
1251     45756
322      56279
615     106663
811     131883
1417    103525
218     110051
810     111546
76      124371
49      171786
1089     64975
67      117143
1177     63125
1326     43370
882      77143
244      87208
937      73326
426      80677
433     122513
1141     83541
613     129601
718     100697
764     104727
885     124560
411      93452
900      62035
192      70968
168     114510
916      92806
1252     53330
380      88489
950      70300
203     126626
855      72981
847      64929
1118    112676
701     125334
1075     76590
1201     37150
277      90237
774     117397
Name: salary, dtype: int64

In [33]:
y_pred[:50]

array([ 58816., 141184.,  75840.,  94208., 125696.,  87936., 116800.,
       100736.,  75840., 116480.,  39616.,  61056., 107200., 125696.,
        93888., 116800., 107200., 141120., 174272.,  68544., 141120.,
        61056.,  52288.,  75840.,  94208.,  75840.,  75840., 125696.,
        87936., 125696., 100736., 116800., 118784.,  87936.,  61056.,
        75840., 116288., 100736.,  39616.,  87936.,  61056., 125696.,
        75840.,  61056., 118784., 118784.,  75840.,  39616.,  87936.,
       125696.])