In [27]:
import pandas as pd
import numpy as np
import xgboost
from sklearn import datasets, preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder

## Import Data

In [28]:
#import data
input_features = pd.read_csv('train_features.csv')
input_salaries = pd.read_csv('train_salaries.csv')
input_features.head()

Unnamed: 0,jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis
0,JOB1362684407687,COMP37,CFO,MASTERS,MATH,HEALTH,10,83
1,JOB1362684407688,COMP19,CEO,HIGH_SCHOOL,NONE,WEB,3,73
2,JOB1362684407689,COMP52,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38
3,JOB1362684407690,COMP38,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17
4,JOB1362684407691,COMP7,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16


## Preprocessing

In [29]:
company_id = input_features['companyId'].apply(lambda x: int(x[4:]))
input_features['companyId'] = company_id

#encode jobType, degree, major, industry
input_features = pd.get_dummies(input_features, columns=['jobType', 'degree', 'major', 'industry'])

#standardize yearsExperience, milesFromMetropolis
input_features['yearsExperience'] = preprocessing.scale(input_features['yearsExperience'])
input_features['milesFromMetropolis'] = preprocessing.scale(input_features['milesFromMetropolis'])
input_features.head()



Unnamed: 0,jobId,companyId,yearsExperience,milesFromMetropolis,jobType_CEO,jobType_CFO,jobType_CTO,jobType_JANITOR,jobType_JUNIOR,jobType_MANAGER,...,major_MATH,major_NONE,major_PHYSICS,industry_AUTO,industry_EDUCATION,industry_FINANCE,industry_HEALTH,industry_OIL,industry_SERVICE,industry_WEB
0,JOB1362684407687,37,-0.276245,1.159051,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,JOB1362684407688,19,-1.246797,0.812763,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,JOB1362684407689,52,-0.276245,-0.399244,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,JOB1362684407690,38,-0.553546,-1.126448,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,JOB1362684407691,7,-0.553546,-1.161077,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [30]:
#create training & validation sets
data = pd.merge(input_features, input_salaries, how = "left", on= "jobId")
train =  data.sample(frac=0.80,random_state= 100)
validate = data.drop(train.index)

train_x = train.iloc[:, 2:33]
train_y = train.iloc[:, 33]

validate_x = validate.iloc[:, 2:33]
validate_y = validate.iloc[:, 33]

print("Preprocessing Complete")

Preprocessing Complete


## Prediction

In [31]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

def objective(space):
    model = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma = space['gamma'],               
                 learning_rate = space['learning_rate'],
                 max_depth = space['max_depth'],
                 min_child_weight = space['min_child_weight'],
                 n_estimators = 500,                                                                    
                 reg_alpha = space['reg_alpha'],
                 reg_lambda = space['reg_lambda'],
                 subsample = space['subsample'],
                 seed = space['seed'])

    eval_set  = [( train_x, train_y), ( validate_x, validate_y)]

    model.fit(train_x, train_y,eval_set=eval_set,
            eval_metric = 'mae')
    predictions = model.predict(validate_x)
    mse = np.sqrt(mean_squared_error(validate_y, predictions))
    print('MSE:', mse)
    return{'loss':mse, 'status': STATUS_OK }


space ={
        'gamma' : 0,
        'learning_rate': hp.loguniform('learning_rate', 0.0001, 0.1),
        'max_depth': hp.choice('max_depth', np.arange(5, 10+1, dtype=int)),
        'min_child_weight': hp.quniform ('min_child_weight', 1, 10, 1),
        'reg_alpha' : 0.75,
        'reg_lambda' : 0.45,
        'subsample': hp.uniform ('subsample', 0.6, 1),
        'seed': 42
    }


trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=3,
            trials=trials)

print(best)

[0]	validation_0-mae:27.3455	validation_1-mae:27.37
[1]	validation_0-mae:21.2507	validation_1-mae:21.2832
[2]	validation_0-mae:21.1369	validation_1-mae:21.179
[3]	validation_0-mae:17.7151	validation_1-mae:17.7573
[4]	validation_0-mae:17.347	validation_1-mae:17.4199
[5]	validation_0-mae:16.5005	validation_1-mae:16.5782
[6]	validation_0-mae:16.3178	validation_1-mae:16.402
[7]	validation_0-mae:16.1774	validation_1-mae:16.2654
[8]	validation_0-mae:15.82	validation_1-mae:15.9204
[9]	validation_0-mae:15.6793	validation_1-mae:15.8081
[10]	validation_0-mae:15.6639	validation_1-mae:15.8123
[11]	validation_0-mae:15.6579	validation_1-mae:15.8092
[12]	validation_0-mae:15.649	validation_1-mae:15.8031
[13]	validation_0-mae:15.6338	validation_1-mae:15.8174


KeyboardInterrupt: 

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

#model training
#model = ExtraTreesRegressor(n_estimators=100)
#model = GradientBoostingRegressor(min_samples_leaf= 4, learning_rate= 0.1, max_depth= 4)
model = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,               
                 learning_rate=0.1,
                 max_depth=2,
                 min_child_weight=8,
                 n_estimators=500,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
## 18.87
model.fit(train_x,train_y)
x = model.feature_importances_
print("ranking",x)

predictions = model.predict(validate_x)
print("salary_predictions",predictions)
print('Root Mean Squared Error: ', np.sqrt(mean_squared_error(validate_y, predictions)))
print('Variance Explained (R^2): ', r2_score(validate_y, predictions))