# Predicting Salary
## Using job role and work location

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.metrics import r2_score

In [2]:
# Read the employee records from the JSON file into a DataFrame.
data = pd.read_json(path_or_buf="../data/employees.json")

In [4]:
# Peek at the first five rows to see which columns are available.
data.head(n=5)

Unnamed: 0,user_id,first_name,last_name,full_name,phone_number,job_role,work_location,salary,direct_reports
0,e699f352-421f-11ef-9ed8-08d23ea34e9e,Ethel,Demott,Ethel Demott,(172) 472-1498,Chief Executive Officer,Hartford,276590,[e69ca397-421f-11ef-ac40-08d23ea34e9e]
1,e69ca397-421f-11ef-ac40-08d23ea34e9e,Kevin,Mcintyre,Kevin Mcintyre,(734) 731-7969,Chief Technology Officer,Hartford,247144,"[e69d194c-421f-11ef-9762-08d23ea34e9e, e69db58..."
2,e69d194c-421f-11ef-9762-08d23ea34e9e,Inell,King,Inell King,(321) 669-1282,Director of Engineering,Phoenix,247949,"[e69ec6ff-421f-11ef-9a17-08d23ea34e9e, e6a032a..."
3,e69db58d-421f-11ef-b5c9-08d23ea34e9e,Anna,Creed,Anna Creed,(114) 924-1765,Director of Engineering,Los Angeles,384159,"[e69ef80b-421f-11ef-8d90-08d23ea34e9e, e6a3325..."
4,e69e51ce-421f-11ef-99df-08d23ea34e9e,Leah,Brim,Leah Brim,(395) 560-2898,Director of Engineering,New York City,471581,[e69ef80c-421f-11ef-9071-08d23ea34e9e]


In [7]:
# Number of employees per job role, most common role first.
data["job_role"].value_counts(sort=True, ascending=False)

job_role
Junior Developer            1000
Intern                       200
Software Engineer            100
Senior Software Engineer      50
Technical Lead                20
Project Manager               10
Engineering Manager            5
Director of Engineering        3
Chief Executive Officer        1
Chief Technology Officer       1
Name: count, dtype: int64

In [8]:
# Number of employees per work location, most common location first.
data["work_location"].value_counts(sort=True, ascending=False)

work_location
Hartford         144
Houston          138
Los Angeles      136
New York City    135
San Jose         126
San Francisco    124
San Diego        124
Chicago          120
Phoenix          118
Philadelphia     114
Dallas           111
Name: count, dtype: int64

In [9]:
# One-hot encode the two categorical predictors: every distinct job role and
# work location becomes its own boolean column.
categorical_features = data[["job_role", "work_location"]]
X = pd.get_dummies(categorical_features)

# Salary is the regression target.
y = data["salary"]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Inspect the first five encoded training rows.
X_train.head(n=5)

Unnamed: 0,job_role_Chief Executive Officer,job_role_Chief Technology Officer,job_role_Director of Engineering,job_role_Engineering Manager,job_role_Intern,job_role_Junior Developer,job_role_Project Manager,job_role_Senior Software Engineer,job_role_Software Engineer,job_role_Technical Lead,...,work_location_Dallas,work_location_Hartford,work_location_Houston,work_location_Los Angeles,work_location_New York City,work_location_Philadelphia,work_location_Phoenix,work_location_San Diego,work_location_San Francisco,work_location_San Jose
538,False,False,False,False,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1333,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
982,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
812,False,False,False,False,False,True,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1159,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [19]:
# Salaries matching the training rows shown above (same index labels).
y_train.head(n=5)

538      59941
1333     52957
982      81513
812      78220
1159    119461
Name: salary, dtype: int64

In [11]:
# Build a linear regression estimator and fit it on the training split.
# Note: despite the `_clf` suffix this is a regressor, not a classifier.
linreg_clf = LinearRegression()
linreg_clf.fit(X=X_train, y=y_train)

In [13]:
# Persist the fitted model to disk so it can be loaded again without retraining.
file_path = "salary_predictor.pkl"

with open(file_path, mode="wb") as model_file:
    pickle.dump(linreg_clf, model_file)

In [24]:
# Predict salaries for the held-out rows and report the R^2 score of those
# predictions against the true salaries.
y_pred = linreg_clf.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.9440116632109483


In [30]:
# Inspect the first five encoded test rows.
X_test.head()

Unnamed: 0,job_role_Chief Executive Officer,job_role_Chief Technology Officer,job_role_Director of Engineering,job_role_Engineering Manager,job_role_Intern,job_role_Junior Developer,job_role_Project Manager,job_role_Senior Software Engineer,job_role_Software Engineer,job_role_Technical Lead,...,work_location_Dallas,work_location_Hartford,work_location_Houston,work_location_Los Angeles,work_location_New York City,work_location_Philadelphia,work_location_Phoenix,work_location_San Diego,work_location_San Francisco,work_location_San Jose
558,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
168,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
240,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
664,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
271,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [26]:
# Actual salaries for the first 50 test rows, for comparison with the predictions.
y_test.head(n=50)

558      94101
168     138497
240     100749
664      69104
271      71887
585     113502
1329     71297
552      97491
901     124161
1048     72426
680     103222
1177    102420
344     118267
270     129773
67      224161
231     121642
720      71159
101     118607
163     173407
497      85189
915     125269
937     110221
1088     77748
115     111693
618      99138
1217     33307
745     109166
782      60955
435     125773
322      67742
695     115807
1249     50984
78      155651
354     102860
1155    124916
342      83423
869      85607
838      75730
51      165855
570      80370
49      134106
1374     44104
439      90604
602     107985
614      78379
906     114396
1070     95952
1052     69586
429      91374
1232     47997
Name: salary, dtype: int64

In [29]:
# Predicted salaries for the same first 50 test rows (positional order matches y_test).
y_pred[0:50]

array([ 87180., 141916.,  94012.,  67100.,  67100., 111868.,  83100.,
       105996., 119196.,  74684.,  94012.,  94012., 126428., 126428.,
       203500., 111868.,  67100., 116380., 168124.,  87180., 126428.,
       111868.,  67100., 123564.,  94012.,  23772., 100220.,  67100.,
       119196.,  67100., 119196.,  31356., 166172., 105996., 119196.,
        87180.,  94012.,  67100., 171484.,  81868., 151404.,  23772.,
        81868., 100220.,  81868., 119196., 100220.,  74684.,  94012.,
        31356.])