In [1]:
import pandas as pd
import numpy as np
import xgboost
from sklearn import datasets, preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder

## Import Data

In [2]:
#import data
input_features = pd.read_csv('DSciHomeworkAssignmentV2/train_features.csv')
input_salaries = pd.read_csv('DSciHomeworkAssignmentV2/train_salaries.csv')
input_features.head()

## Preprocessing

In [3]:
company_id = input_features['companyId'].apply(lambda x: int(x[4:]))
input_features['companyId'] = company_id

#encode jobType, degree, major, industry
input_features = pd.get_dummies(input_features, columns=['jobType', 'degree', 'major', 'industry'])

#standardize yearsExperience, milesFromMetropolis
input_features['yearsExperience'] = preprocessing.scale(input_features['yearsExperience'])
input_features['milesFromMetropolis'] = preprocessing.scale(input_features['milesFromMetropolis'])
input_features.head()



Unnamed: 0,jobId,companyId,yearsExperience,milesFromMetropolis,jobType_CEO,jobType_CFO,jobType_CTO,jobType_JANITOR,jobType_JUNIOR,jobType_MANAGER,...,major_MATH,major_NONE,major_PHYSICS,industry_AUTO,industry_EDUCATION,industry_FINANCE,industry_HEALTH,industry_OIL,industry_SERVICE,industry_WEB
0,JOB1362684407687,37,-0.276245,1.159051,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,JOB1362684407688,19,-1.246797,0.812763,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,JOB1362684407689,52,-0.276245,-0.399244,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,JOB1362684407690,38,-0.553546,-1.126448,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,JOB1362684407691,7,-0.553546,-1.161077,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [9]:
#create training & validation sets
data = pd.merge(input_features, input_salaries, how = "left", on= "jobId")
train =  data.sample(frac=0.80,random_state= 100)
validate = data.drop(train.index)

train_x = train.iloc[:, 2:33]
train_y = train.iloc[:, 33]

validate_x = validate.iloc[:, 2:33]
validate_y = validate.iloc[:, 33]

print("Preprocessing Complete")

Preprocessing Complete


## Prediction

In [30]:
#model training
#model = ExtraTreesRegressor(n_estimators=100)
#model = GradientBoostingRegressor(min_samples_leaf= 4, learning_rate= 0.1, max_depth= 4)
model = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,               
                 learning_rate=0.1,
                 max_depth=6,
                 min_child_weight=1.5,
                 n_estimators=500,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
## 18.87
model.fit(train_x,train_y)
x = model.feature_importances_
print("ranking",x)

predictions = model.predict(validate_x)
print("salary_predictions",predictions)
print('Root Mean Squared Error: ', np.sqrt(mean_squared_error(validate_y, predictions)))
print('Variance Explained (R^2): ', r2_score(validate_y, predictions))

ranking [0.13966116 0.15562554 0.02747611 0.02859832 0.0263177  0.0085795
 0.02418187 0.02620909 0.02269765 0.02606429 0.035585   0.03667101
 0.01299595 0.03102375 0.01516797 0.02251665 0.02233565 0.01943962
 0.01737619 0.02523168 0.02143064 0.0174848  0.01730379 0.0181726
 0.02874312 0.02497828 0.02997394 0.0270779  0.03301477 0.02928613
 0.02877932]
salary_predictions [116.07314  137.46407  125.65225  ... 127.916245 119.1691    63.41762 ]
Root Mean Squared Error:  18.828710483465915
Variance Explained (R^2):  0.7635836977521642


In [29]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

target = 'loss'
del train[target]
del validate[target]

def objective(space):
    model = xgboost.XGBRegressor(n_estimators = 100,
                            max_depth = space['max_depth'],
                            min_child_weight = space['min_child_weight'],
                            subsample = space['subsample'])

    eval_set  = [( train_x, train_y), ( validate_x, validate_y)]

    model.fit(train_x, train_y,eval_set=eval_set,
            eval_metric = 'mae')
    predictions = model.predict(validate_x)
    mse = np.sqrt(mean_squared_error(validate_y, predictions))
    print('MSE:', mse)
    return{'loss':mse, 'status': STATUS_OK }


space ={
        'max_depth': hp.quniform("x_max_depth", 5, 10, 1),
        'min_child_weight': hp.quniform ('x_min_child', 1, 10, 1),
        'subsample': hp.uniform ('x_subsample', 0.8, 1)
    }


trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=3,
            trials=trials)

print(best)

KeyError: 'loss'