# Read and Prepare the Data

In [1]:
#Common Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDRegressor

In [2]:
jobs = pd.read_csv('jobs.csv')

In [3]:
jobs.head(5)

Unnamed: 0,Salary,Job Description
0,67206,Civil Service Title: Regional Director Mental ...
1,88313,The New York City Comptrollerâ€™s Office Burea...
2,81315,With minimal supervision from the Deputy Commi...
3,76426,OPEN TO CURRENT BUSINESS PROMOTION COORDINATOR...
4,55675,Only candidates who are permanent in the Princ...


In [4]:
target = jobs['Salary']

In [5]:
jobs[['Salary']].isna().sum()

Salary    0
dtype: int64

In [6]:
input_data = jobs['Job Description']

In [7]:
train_set, test_set, train_y, test_y = train_test_split(input_data, target, test_size=0.3, random_state=42)

In [8]:
train_set.shape, train_y.shape
test_set.shape, test_y.shape

((724,), (724,))

In [9]:
#Countvectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english')
train_x_tr = count_vect.fit_transform(train_set)
test_x_tr = count_vect.transform(test_set)

In [10]:
#Performing the tf-idf fit_transformation on the train data set
from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer()
train_x_tfidf = tf_transformer.fit_transform(train_x_tr)

In [11]:
test_x_tfidf = tf_transformer.transform(test_x_tr)

In [12]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=500, n_iter=10)

In [13]:
train_x_lsa = svd.fit_transform(train_x_tfidf)
train_x_lsa.shape

(1689, 500)

In [14]:
#Selecting and sorting the first component
first_component = svd.components_[0,:]
indeces = np.argsort(first_component).tolist()
print(indeces)

[6450, 7432, 9466, 1166, 6843, 1749, 4530, 5531, 6584, 2920, 7336, 7992, 2383, 2467, 1194, 7386, 3263, 7532, 9158, 719, 7831, 2970, 5785, 9193, 7677, 5409, 1535, 2267, 8404, 9103, 5804, 5334, 7140, 5122, 5198, 9591, 9579, 1934, 2539, 9887, 1719, 9889, 3450, 9888, 3742, 88, 7775, 2939, 1439, 7494, 2588, 3826, 6239, 1646, 6276, 9495, 5498, 9706, 6310, 8386, 5655, 4935, 2157, 691, 692, 8148, 2811, 8072, 4528, 3010, 3478, 702, 8070, 8069, 8723, 4625, 5633, 5631, 9728, 2565, 6235, 5912, 3739, 7945, 5586, 2566, 3803, 9641, 4725, 4687, 2925, 1310, 1201, 9091, 9708, 149, 9696, 361, 8207, 8182, 1195, 5413, 7180, 8161, 8447, 6377, 9729, 8843, 3502, 3501, 7489, 3847, 9723, 9730, 6371, 8803, 5588, 9661, 4709, 4703, 434, 4166, 3302, 5399, 9612, 9790, 2929, 8036, 4120, 6128, 1447, 9604, 7531, 84, 1795, 2927, 3943, 8993, 8375, 1237, 7546, 5977, 5123, 4311, 5737, 651, 3162, 3900, 9284, 4070, 8941, 7379, 8201, 8001, 3604, 2608, 5927, 2135, 7871, 4310, 104, 7436, 9229, 7306, 2593, 4643, 6592, 1858, 5861

In [15]:
#Printing 10 terms that have the highest weigths
feat_names = count_vect.get_feature_names()
for index in indeces[-10:]:
    print(feat_names[index], "\t\tweight =", first_component[index])

bureau 		weight = 0.10717931879805558
management 		weight = 0.1096076543764516
new 		weight = 0.12230386159112747
design 		weight = 0.1298619525489696
city 		weight = 0.13608521123029885
project 		weight = 0.13846106935768074
dep 		weight = 0.14965285958787902
construction 		weight = 0.15479839429867337
wastewater 		weight = 0.15841529884565175
water 		weight = 0.2642504496809518


In [16]:
test_x_lsa = svd.transform(test_x_tfidf)
test_x_lsa.shape

(724, 500)

In [17]:
# Average value of the target

mean_value = np.mean(target)
mean_value

77990.33029423954

In [18]:
# Predicting all values as the mean

baseline_pred = np.repeat(mean_value, len(target))
baseline_pred

array([77990.33029424, 77990.33029424, 77990.33029424, ...,
       77990.33029424, 77990.33029424, 77990.33029424])

In [19]:
# Calculating Baseline RMSE
baseline_mse = mean_squared_error(target, baseline_pred)
baseline_rmse = np.sqrt(baseline_mse)
print('Baseline RMSE: {}' .format(baseline_rmse))

Baseline RMSE: 29196.68788150113


# Model 1
### Use any model that we have covered so far

In [20]:
#Trying RandomForestRegressor
rnd_reg = RandomForestRegressor(n_estimators=100, max_leaf_nodes=16, n_jobs=-1) 

rnd_reg.fit(train_x_lsa, train_y)



RandomForestRegressor(max_leaf_nodes=16, n_jobs=-1)

In [21]:
#Train RMSE for RandomForestRegressor Model
reg_train_pred = rnd_reg.predict(train_x_lsa)
train_mse = mean_squared_error(train_y, reg_train_pred)

train_rmse = np.sqrt(mean_squared_error (train_y, reg_train_pred))

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 20072.716931564995


In [22]:
#Test RMSE for RandomForestRegressor Model
reg_test_pred = rnd_reg.predict(test_x_lsa)

test_mse = mean_squared_error (test_y, reg_test_pred)

test_rmse = np.sqrt(mean_squared_error (test_y, reg_test_pred))

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 23399.17623628251


In [23]:
#Trying SGDRegressor with GridSearch


from sklearn.model_selection import RandomizedSearchCV

sgd_reg = SGDRegressor(max_iter=100000, penalty='elasticnet', alpha = 0.001)

param_grid = [
    {'l1_ratio':(0.01,0.99),'eta0':(0.001,1),'tol':(0.0001,0.1) }
  ]

sgd_reg = SGDRegressor()

grid_search = RandomizedSearchCV(sgd_reg, param_grid, cv=3, n_iter=10,
                           scoring='neg_mean_squared_error', verbose=1,
                           return_train_score=True)

In [24]:
grid_search.fit(train_x_lsa, train_y)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:   15.5s finished


RandomizedSearchCV(cv=3, estimator=SGDRegressor(),
                   param_distributions=[{'eta0': (0.001, 1),
                                         'l1_ratio': (0.01, 0.99),
                                         'tol': (0.0001, 0.1)}],
                   return_train_score=True, scoring='neg_mean_squared_error',
                   verbose=1)

In [25]:
#Train RMSE for SGDRegressor Model
reg_train_pred = grid_search.predict(train_x_lsa)
train_mse = mean_squared_error(train_y, reg_train_pred)

train_rmse = np.sqrt(mean_squared_error (train_y, reg_train_pred))

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 15993.007851848408


In [26]:
#Test RMSE for SGDRegressor Model with L1 Regularization
reg_test_pred = grid_search.predict(test_x_lsa)

test_mse = mean_squared_error (test_y, reg_test_pred)

test_rmse = np.sqrt(mean_squared_error (test_y, reg_test_pred))

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 19997.65984688136


# Model 2
### Use any model that we have covered so far

In [27]:
import nltk
from nltk.corpus import stopwords
import re

In [28]:
#Creating a blank list

new_train = []

# For each row in train_set, we will read the text, tokenize it, remove stopwords, lemmatize it, 
# and save it to the new list

for text in train_set:
    text = re.sub(r'[!"#$%&()*+,-./:;<=>?[\]^_`{|}~]', ' ', text).lower()
        
    words= nltk.tokenize.word_tokenize(text)
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if len(w)>2 and w not in stopwords.words('english')]
        
    lemmatizer = nltk.stem.WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    new_train.append(' '.join(words))

In [29]:
# Let's convert the original train_set to a dataframe

train_set_df = pd.DataFrame(train_set)

train_set_df['new_text'] = new_train

train_set_df

Unnamed: 0,Job Description,new_text
429,Only candidates who are permanent in the Compu...,candidate permanent computer system manager ti...
1185,NYCERS is seeking a Business Analyst with a te...,nycers seeking business analyst technical back...
2116,The NYC Department of Environmental Protection...,nyc department environmental protection dep pr...
2127,Only Candidates permanent in the Assistant Civ...,candidate permanent assistant civil engineer t...
458,Please read this posting carefully to make cer...,please read posting carefully make certain mee...
...,...,...
1638,NYC Civilian Complaint Review Board The Civil...,nyc civilian complaint review board civilian c...
1095,The NYC Department of Environmental Protection...,nyc department environmental protection dep en...
1130,The NYC Office of Payroll Administration is re...,nyc office payroll administration recruiting i...
1294,HPDTech is the IT division within HPD. Its mis...,hpdtech division within hpd mission identify a...


In [30]:
# Performing Same Operation for test data 

new_test = []

for text in test_set:
    text = re.sub(r'[!"#$%&()*+,-./:;<=>?[\]^_`{|}~]', ' ', text).lower()
        
    words= nltk.tokenize.word_tokenize(text)
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if len(w)>2 and w not in stopwords.words('english')]
        
    lemmatizer = nltk.stem.WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    new_test.append(' '.join(words))



test_set_df = pd.DataFrame(test_set)

test_set_df['new_text'] = new_test

test_set_df

Unnamed: 0,Job Description,new_text
765,The New York City Housing Authority (NYCHA) is...,new york city housing authority nycha largest ...
2387,"Hiring Rate: $62,272.00 (Flat Rate-Annual) ...",hiring rate flat rate annual mission bureau hi...
2162,The Executive Director for Regulatory Reform w...,executive director regulatory reform assist im...
1833,The NYC Department of Environmental Protection...,nyc department environmental protection dep pr...
1814,The Department of Transportationâ€™s (DOT) mis...,department dot mission provide safe efficient ...
...,...,...
2333,The Family Independence Administration/ Office...,family independence administration office rese...
998,In order to be considered for this position ca...,order considered position candidate must servi...
891,In accordance to Local Law 196 established in ...,accordance local law established late sb devel...
1866,About New York City Cyber Command NYC Cyber Co...,new york city cyber command nyc cyber command ...


In [31]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(max_features=600)
train_x_tr = count_vect.fit_transform(train_set_df['new_text'])

In [32]:
test_x_tr = count_vect.transform(test_set_df['new_text'])

In [33]:
train_x_tr, test_x_tr

(<1689x600 sparse matrix of type '<class 'numpy.int64'>'
 	with 156176 stored elements in Compressed Sparse Row format>,
 <724x600 sparse matrix of type '<class 'numpy.int64'>'
 	with 68613 stored elements in Compressed Sparse Row format>)

In [34]:
from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer()

train_x_tfidf = tf_transformer.fit_transform(train_x_tr)

train_x_tfidf.shape

(1689, 600)

In [35]:
#Performing the tf-idf transformation on the test data set

test_x_tfidf = tf_transformer.transform(test_x_tr)

test_x_tfidf.shape

(724, 600)

In [36]:
#RandomForestRegressor with Grid Search
param_grid = [
    {'n_estimators':(100,1000),'max_leaf_nodes':(100,500) }
  ]

rnd_reg_2 = RandomForestRegressor( n_jobs=-1) 

rnd_reg_2 = RandomizedSearchCV(rnd_reg_2, param_grid, n_jobs=-1, cv=3, n_iter=5,
                           scoring='neg_mean_squared_error', verbose=1,
                           return_train_score=True)

rnd_reg_2.fit(train_x_tfidf, train_y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed:   18.5s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  3.2min finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(n_jobs=-1), n_iter=5,
                   n_jobs=-1,
                   param_distributions=[{'max_leaf_nodes': (100, 500),
                                         'n_estimators': (100, 1000)}],
                   return_train_score=True, scoring='neg_mean_squared_error',
                   verbose=1)

In [37]:
#Train RMSE for RandomForestRegressor Model
reg_train_pred = rnd_reg_2.predict(train_x_tfidf)
train_mse = mean_squared_error(train_y, reg_train_pred)

train_rmse = np.sqrt(mean_squared_error (train_y, reg_train_pred))

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 6669.5416082479705


In [38]:
#Test RMSE for RandomForestRegressor Model
reg_test_pred = rnd_reg_2.predict(test_x_tfidf)

test_mse = mean_squared_error (test_y, reg_test_pred)

test_rmse = np.sqrt(mean_squared_error (test_y, reg_test_pred))

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 15721.254777332686


In [39]:
#SGDRegressor Model
sgd_reg2 = SGDRegressor(max_iter=100000, alpha = 0.001, 
                          eta0=0.1, tol=0.0001)

In [40]:
sgd_reg2.fit(train_x_tfidf, train_y)

SGDRegressor(alpha=0.001, eta0=0.1, max_iter=100000, tol=0.0001)

In [41]:
#Train RMSE for SGDRegressor Model
reg_train_pred = sgd_reg2.predict(train_x_tfidf)
train_mse = mean_squared_error(train_y, reg_train_pred)

train_rmse = np.sqrt(mean_squared_error (train_y, reg_train_pred))

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 18541.71083496454


In [42]:
#Test RMSE for SGDRegressor Model
reg_test_pred = sgd_reg2.predict(test_x_tfidf)

test_mse = mean_squared_error (test_y, reg_test_pred)

test_rmse = np.sqrt(mean_squared_error (test_y, reg_test_pred))

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 20902.179605316418


# Model 3

In [43]:
from tensorflow.keras.preprocessing.text import Tokenizer

keras_tokenizer = Tokenizer(num_words=250, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)

keras_tokenizer.fit_on_texts(train_set)

In [44]:
train_binary_matrix = keras_tokenizer.texts_to_matrix(train_set, mode='tfidf')
test_binary_matrix = keras_tokenizer.texts_to_matrix(test_set, mode='tfidf')

In [45]:
#RandomForestRegressor with Grid Search

param_grid = [
    {'n_estimators':(100,1000),'max_leaf_nodes':(100,500),'max_depth':(10,100) } ]

rnd_reg_3 = RandomForestRegressor( n_jobs=-1) 

rnd_reg_3 = RandomizedSearchCV(rnd_reg_3, param_grid, n_jobs=-1, cv=3, n_iter=5,
                           scoring='neg_mean_squared_error', verbose=1,
                           return_train_score=True)

rnd_reg_3.fit(train_binary_matrix, train_y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:    7.0s remaining:    6.1s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   39.6s finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(n_jobs=-1), n_iter=5,
                   n_jobs=-1,
                   param_distributions=[{'max_depth': (10, 100),
                                         'max_leaf_nodes': (100, 500),
                                         'n_estimators': (100, 1000)}],
                   return_train_score=True, scoring='neg_mean_squared_error',
                   verbose=1)

In [47]:
#Train RMSE for RandomForestRegressor Model
reg_train_pred = rnd_reg_3.predict(train_binary_matrix)
train_mse = mean_squared_error(train_y, reg_train_pred)

train_rmse = np.sqrt(mean_squared_error (train_y, reg_train_pred))

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 6582.312273696255


In [49]:
#Test RMSE for RandomForestRegressor Model
reg_test_pred = rnd_reg_3.predict(test_binary_matrix)

test_mse = mean_squared_error (test_y, reg_test_pred)

test_rmse = np.sqrt(mean_squared_error (test_y, reg_test_pred))

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 16450.196126951978


In [50]:
#SGDRegressor
sgd_reg3 = SGDRegressor(max_iter=10000, alpha = 0.001, 
                          eta0=0.001, tol=0.0001)

In [51]:
sgd_reg3.fit(train_binary_matrix, train_y)

SGDRegressor(alpha=0.001, eta0=0.001, max_iter=10000, tol=0.0001)

In [52]:
#Train RMSE for SGDRegressor Model
reg_train_pred = sgd_reg3.predict(train_binary_matrix)
train_mse = mean_squared_error(train_y, reg_train_pred)

train_rmse = np.sqrt(mean_squared_error (train_y, reg_train_pred))

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 20109.80349443754


In [53]:
#Test RMSE for SGDRegressor Model 
reg_test_pred = sgd_reg3.predict(test_binary_matrix)

test_mse = mean_squared_error (test_y, reg_test_pred)

test_rmse = np.sqrt(mean_squared_error (test_y, reg_test_pred))

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 22400.715111784746
