In [3]:
from sklearn import metrics
from xgboost.sklearn import XGBClassifier
from sklearn.externals import joblib

import numpy
import pandas
import sklearn.ensemble
import sklearn.feature_extraction
import sklearn.linear_model
import sklearn.model_selection
import tf_utils
import xgboost

print("Done")

Done


In [4]:
# General setup.

# Return mean absolute error scale.
def mean_absolute_error_salary_scale(y_test, y_predicted):
    return sklearn.metrics.mean_absolute_error(
        numpy.exp(y_test), numpy.exp(y_predicted))

def mae_without_log_scaling(y_test, y_predicted):
    return sklearn.metrics.mean_absolute_error(
        y_test, y_predicted)

In [11]:
# Define training and test data.

data = pandas.read_csv('data/train.csv')

X_train_index, X_test_index, Y_train, Y_test = (
    sklearn.model_selection.train_test_split(
        data.index, data['SalaryNormalized'],
        test_size=.3, random_state=42))

# Define BOW data.
TRAINING_COLUMN = 'CompleteJobListing'

X_train_bow = data.iloc[X_train_index]
X_test_bow = data.iloc[X_test_index]

count_vect = sklearn.feature_extraction.text.CountVectorizer(
    stop_words='english', min_df=50, decode_error='ignore')

# Transform BOW test set.
X_train_counts = count_vect.fit_transform(
    X_train_bow[TRAINING_COLUMN])
X_test_counts = count_vect.transform(
    X_test_bow[TRAINING_COLUMN])

# Normalize BOW
normalizer = sklearn.preprocessing.Normalizer(norm='l1')
X_train_norm = normalizer.fit_transform(X_train_counts.astype('float64'))
X_test_norm = normalizer.transform(X_test_counts.astype('float64'))
print(X_train_norm.shape)

print(X_test_counts.shape)

(137179, 10996)
(58792, 10996)


In [3]:
# Define training and test data using word2vec

word_2_vec_train_data = pandas.read_csv('word2vecdf.csv')
word_2_vec_target_data = pandas.read_csv('word2vectargets.csv')

X_train_w2v, X_test_w2v, Y_train_w2v, Y_test_w2v = (
    sklearn.model_selection.train_test_split(
        word_2_vec_train_data, word_2_vec_target_data,
        test_size=0.2, random_state=42))

print(X_train_w2v.shape)
print(X_test_w2v.shape)

(156776, 300)
(39194, 300)


In [5]:
# Define LDA data.
lda_raw = pandas.read_csv('data/train.csv')

lda_data = pandas.read_csv('data/train_lda_30.csv')
print(len(lda_raw))
print(len(lda_data))

X_train_lda_index, X_test_lda_index, Y_train_lda, Y_test_lda = (
    sklearn.model_selection.train_test_split(
        lda_raw.index, lda_raw['LogSalaryNormalized'],
        test_size=.3, random_state=42))

X_train_lda = lda_data.iloc[X_train_lda_index]
X_test_lda = lda_data.iloc[X_test_lda_index]

195971
195971


In [10]:
# Just guess the mean (baseline model)

guess_the_mean = tf_utils.MeanEstimator()

guess_the_mean.fit(X_train_bow, Y_train)
mean_predictions = guess_the_mean.predict(X_test_bow)

# Scale to get Mean Absolute Error
rf_mae = mae_without_log_scaling(Y_test, mean_predictions)
print(rf_mae)

13398.6388346


In [13]:
# Model: Random Forests
# Data Representation: LDA
# Library used: sklearn

# Tried increasing min_samples_leaf and min_samples_split, both did worse

rf = sklearn.ensemble.RandomForestRegressor(
    n_estimators=1000, n_jobs=-1, warm_start=True)

rf.fit(X_train_lda, Y_train_lda)
rf_predictions = rf.predict(X_test_lda)

# Scale to get Mean Absolute Error
rf_mae = mean_absolute_error_salary_scale(Y_test_lda, rf_predictions)

print('Random Forest Regressor Mean Absolute Error: {:10.4f}'.format(rf_mae))

Random Forest Regressor Mean Absolute Error:  8380.5428


In [20]:
# Model: Random Forests using batch learning
# Data Representation: LDA
# Library used: sklearn

def batch_processed_rf(X_train, Y_train, batch_size=1000, n_estimators_per_batch=100):
    """Function to process random forest in batches using warm start."""
    
    def run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows):     
        """Function to run next batch of random forest."""
        processed_rows = total_processed_rows + len(X_train_batch)
        rf.set_params(n_estimators=n_estimators)
        rf.fit(X_train_batch, Y_train_batch)
        rf_predictions = rf.predict(X_test_lda)
        rf_mae = mean_absolute_error_salary_scale(Y_test, rf_predictions)
        print('Number of rows trained on: {:10.4f}'.format(processed_rows))
        print('Number of trees in forest: {:10.4f}'.format(n_estimators))
        print('Random Forest Regressor Mean Absolute Error: {:10.4f}'.format(rf_mae))
        print('')
    
    n_estimators = n_estimators_per_batch
    total_processed_rows = 0
    
    # Define model.
    rf = sklearn.ensemble.RandomForestRegressor(
    n_estimators=n_estimators_per_batch, n_jobs=-1, warm_start=True)
    
    # Reindex training and test data.
    # Should randomize here, but we have already taken this step, so no need in our case.
    X_train.reset_index(drop=True)
    Y_train.reset_index(drop=True)
    
    while total_processed_rows < len(X_train) - batch_size:     
        # Scale to get Mean Absolute Error
        batch_index = range(total_processed_rows, total_processed_rows + batch_size)
        X_train_batch = X_train.iloc[batch_index]
        Y_train_batch = Y_train.iloc[batch_index]
        run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows)
        total_processed_rows += batch_size
        n_estimators += n_estimators_per_batch
        
    remaining_rows = len(X_train) - total_processed_rows
    if remaining_rows == 0:
        return
    batch_index = range(total_processed_rows, len(X_train))
    X_train_batch = X_train.iloc[batch_index]
    Y_train_batch = Y_train.iloc[batch_index]
    run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows)

batch_processed_rf(X_train_lda, Y_train, batch_size=1000, n_estimators_per_batch=100)

ValueError: Found input variables with inconsistent numbers of samples: [51416, 6000]

In [6]:
# Model: Random Forests using batch learning - Tried without batch
# learning, just ran forever.
# Data Representation: Bag of Words
# Library used: sklearn

# NOTE: This is just to get this set up and working.
# TODO: Move this to be a single function under 'Random Forests' where you simply define the
# model as being BOW or LDA.

def batch_processed_rf_bow(X_train, Y_train, batch_size=1000, n_estimators_per_batch=100):
    """Function to process random forest in batches using warm start."""
    # Define model.
    n_estimators = n_estimators_per_batch
    total_processed_rows = 0
    
    rf = sklearn.ensemble.RandomForestRegressor(
        n_estimators=n_estimators_per_batch, n_jobs=-1, warm_start=True)
   
    # Reindex training and test data.
    # Should randomize here, but we have already taken this step, so no need in our case.
    Y_train.reset_index(drop=True)
    
    def run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows):     
        """Function to run next batch of random forest."""
        # Transform the batch.
        processed_rows = total_processed_rows + len(X_train_batch)
        batch_X_train_count = count_vect.transform(X_train_batch[TRAINING_COLUMN])
        rf.set_params(n_estimators=n_estimators)
        rf.fit(batch_X_train_count, Y_train_batch)
        rf_predictions = rf.predict(X_test_counts)
        rf_mae = mean_absolute_error_salary_scale(Y_test, rf_predictions)
        print('Number of rows trained on: {:10.4f}'.format(processed_rows))
        print('Number of trees in forest: {:10.4f}'.format(n_estimators))
        print('Random Forest Regressor Mean Absolute Error: {:10.4f}'.format(rf_mae))
        print('')
    
    while total_processed_rows < X_train.shape[0] - batch_size:     
        # Scale to get Mean Absolute Error
        batch_index = range(total_processed_rows, total_processed_rows + batch_size)
        X_train_batch = X_train.iloc[batch_index]
        Y_train_batch = Y_train.iloc[batch_index]
        run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows)
        total_processed_rows += batch_size
        n_estimators += n_estimators_per_batch
        
    remaining_rows = X_train.shape[0] - total_processed_rows
    if remaining_rows == 0:
        return
    batch_index = range(total_processed_rows, len(X_train))
    X_train_batch = X_train.iloc[batch_index]
    Y_train_batch = Y_train.iloc[batch_index]
    run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows)

batch_processed_rf_bow(X_train_bow, Y_train, batch_size=10000, n_estimators_per_batch=100)

Number of rows trained on: 10000.0000
Number of trees in forest:   100.0000
Random Forest Regressor Mean Absolute Error:  8663.3455

Number of rows trained on: 20000.0000
Number of trees in forest:   200.0000
Random Forest Regressor Mean Absolute Error:  8488.6272

Number of rows trained on: 30000.0000
Number of trees in forest:   300.0000
Random Forest Regressor Mean Absolute Error:  8437.6218

Number of rows trained on: 40000.0000
Number of trees in forest:   400.0000
Random Forest Regressor Mean Absolute Error:  8432.2712

Number of rows trained on: 50000.0000
Number of trees in forest:   500.0000
Random Forest Regressor Mean Absolute Error:  8406.4605

Number of rows trained on: 60000.0000
Number of trees in forest:   600.0000
Random Forest Regressor Mean Absolute Error:  8395.6568

Number of rows trained on: 70000.0000
Number of trees in forest:   700.0000
Random Forest Regressor Mean Absolute Error:  8396.6413

Number of rows trained on: 80000.0000
Number of trees in forest:   80

In [11]:
# Model: Random Forests using batch learning - Tried without batch
# learning, just ran forever.
# Data Representation: Word2vec
# Library used: sklearn

#100/200 its per batch: 9738/9736

def batch_processed_rf_bow(X_train, Y_train, batch_size=1000, n_estimators_per_batch=100):
    """Function to process random forest in batches using warm start."""
    # Define model.
    n_estimators = n_estimators_per_batch
    total_processed_rows = 0
    
    rf = sklearn.ensemble.RandomForestRegressor(
        n_estimators=n_estimators_per_batch, n_jobs=-1, warm_start=True)
   
    # Reindex training and test data.
    # Should randomize here, but we have already taken this step, so no need in our case.
    Y_train.reset_index(drop=True)
    
    def run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows):     
        """Function to run next batch of random forest."""
        processed_rows = total_processed_rows + len(X_train_batch)
        rf.set_params(n_estimators=n_estimators)
        rf.fit(X_train_batch, Y_train_batch)
        rf_predictions = rf.predict(X_test_w2v)
        rf_mae = mae_without_log_scaling(Y_test_w2v, rf_predictions)
        print('Number of rows trained on: {:10.4f}'.format(processed_rows))
        print('Number of trees in forest: {:10.4f}'.format(n_estimators))
        print('Random Forest Regressor Mean Absolute Error: {:10.4f}'.format(rf_mae))
        print('')
    
    while total_processed_rows < X_train.shape[0] - batch_size:     
        # Scale to get Mean Absolute Error
        batch_index = range(total_processed_rows, total_processed_rows + batch_size)
        X_train_batch = X_train.iloc[batch_index]
        Y_train_batch = Y_train.iloc[batch_index]
        run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows)
        total_processed_rows += batch_size
        n_estimators += n_estimators_per_batch
        
    remaining_rows = X_train.shape[0] - total_processed_rows
    if remaining_rows == 0:
        return
    batch_index = range(total_processed_rows, len(X_train))
    X_train_batch = X_train.iloc[batch_index]
    Y_train_batch = Y_train.iloc[batch_index]
    run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows)

batch_processed_rf_bow(X_train_w2v, Y_train_w2v, batch_size=10452, n_estimators_per_batch=500)

KeyboardInterrupt: 

### Interesting note: Using LDA, n_estimators = 10 performs better than 1000

In [16]:
# Model: XGBoost
# Data Representation: BOW
# Library used: xgboost

# n_estimators set to 1000 achieved mae: 5932
# n_estimators set to 2000 achieved mae: 5693
# n_estimators set to 5000 achieved mae: 5507

# Model: XGBoost
# Data Representation: TF-IDF
# Library used: xgboost

# n_estimators set to 1000 achieved mae: 6059
# n_estimators set to 2000 achieved mae: 5827

xgb = xgboost.XGBRegressor(
       n_estimators=5000, learning_rate=.10, max_depth=10, silent=False,
       subsample=.8, colsample_bytree=.8, min_child_weight=.5)

xgb.fit(X_train_counts, Y_train)
xgb_predictions = xgb.predict(X_test_counts)

# Scale to get Mean Absolute Error
xgb_mae = mean_absolute_error_salary_scale(Y_test, xgb_predictions)

print('XGBoost Mean Absolute Error: {:10.4f}'.format(xgb_mae))

KeyboardInterrupt: 

In [8]:
# Model: XGBoost
# Data Representation: BOW - normalised
# Library used: xgboost
# Scale - Regular Scale

# n_estimators = 1000: 6074
# n_estimators = 2000: 5817
# n_estimators = 5601

#min_child_weight=1: no improvement

xgb = xgboost.XGBRegressor(
       n_estimators=5000, learning_rate=.10, max_depth=10, silent=False,
       subsample=.8, colsample_bytree=.8, min_child_weight=1)

xgb.fit(X_train_norm, Y_train)
xgb_predictions = xgb.predict(X_test_norm)

# Scale to get Mean Absolute Error
xgb_mae = mae_without_log_scaling(Y_test, xgb_predictions)

print('XGBoost Mean Absolute Error: {:10.4f}'.format(xgb_mae))

XGBoost Mean Absolute Error:  5601.7726


In [5]:
# Model: XGBoost
# Data Representation: word2vec
# Library used: xgboost
# Scale - Regular scale

# n_estimators=1000: 7020
# n_estimators=2000: 7010

xgb = xgboost.XGBRegressor(
       n_estimators=2000, learning_rate=.10, max_depth=10,
       subsample=.8, colsample_bytree=.8, min_child_weight=1)

xgb.fit(X_train_w2v, Y_train_w2v)
xgb_predictions = xgb.predict(X_test_w2v)

joblib.dump(xgb, 'xgb_w2v.pkl')

xgb_mae = mae_without_log_scaling(Y_test_w2v, xgb_predictions)

print('XGBoost Mean Absolute Error: {:10.4f}'.format(xgb_mae))

XGBoost Mean Absolute Error:  7010.3315


In [19]:
# Model: XGBoost
# Data Representation: LDA
# Library used: xgboost

# n_estimators=5000, learning_rate=.05, min_child_weight=1: 8132
# Tried lots of learning rates, different child weights, subsample and colsample_bytree sizes.
# Best combination: n_estimators=5000, learning_rate=.05, max_depth=10, silent=False,
# subsample=.8, colsample_bytree=.8, min_child_weight=1)

xgb = xgboost.XGBRegressor(
       n_estimators=5000, learning_rate=.05, max_depth=10, silent=False,
       subsample=.8, colsample_bytree=.8, min_child_weight=1)

xgb.fit(X_train_lda, Y_train_lda)
xgb_predictions = xgb.predict(X_test_lda)

# Scale to get Mean Absolute Error
xgb_mae = mean_absolute_error_salary_scale(Y_test_lda, xgb_predictions)

print('XGBoost Mean Absolute Error: {:10.4f}'.format(xgb_mae))

XGBoost Mean Absolute Error:  8295.2584


In [26]:
# Model: SGD
# Data Representation: LDA
# Library used: sklearn

# Normalization not required as LDA output is normalised between 0 and
# 1 anyway.

# We want a stochastic gradient descent with l1 norm.
sgd = sklearn.linear_model.SGDRegressor(
    alpha=.0001, penalty='l1', n_iter=10000)
sgd.fit(X_train_lda, Y_train)
sgd_predictions = sgd.predict(X_test_lda)
sgd_mae = mean_absolute_error_salary_scale(Y_test, sgd_predictions)
print('SGDRegressor Mean Absolute Error: {:10.4f}'.format(sgd_mae))

SGDRegressor Mean Absolute Error:  8929.7799


In [5]:
# Model: SGD
# Data Representation: BOW
# Library used: sklearn
# Scale: Log salary scale

normalizer = sklearn.preprocessing.Normalizer(norm='l1')
X_train_norm = normalizer.fit_transform(X_train_counts.astype('float64'))
X_test_norm = normalizer.transform(X_test_counts.astype('float64'))
print(X_train_norm.shape)

# We want a stochastic gradient descent with l1 norm.
sgd = sklearn.linear_model.SGDRegressor(
    alpha=.0001, penalty='l1', n_iter=10000)
sgd.fit(X_train_norm, Y_train)
sgd_predictions = sgd.predict(X_test_norm)
sgd_mae = mean_absolute_error_salary_scale(Y_test, sgd_predictions)
print('SGDRegressor Mean Absolute Error: {:10.4f}'.format(sgd_mae))

(120037, 10332)
SGDRegressor Mean Absolute Error: 10040.1751


In [28]:
# Model: SGD
# Data Representation: BOW
# Library used: sklearn
# Scale: Regular scale

X_train_index, X_test_index, Y_train, Y_test = (
    sklearn.model_selection.train_test_split(
        data.index, data['SalaryNormalized'],
        test_size=.3, random_state=42))

normalizer = sklearn.preprocessing.Normalizer(norm='l1')
X_train_norm = normalizer.fit_transform(X_train_counts.astype('float64'))
X_test_norm = normalizer.transform(X_test_counts.astype('float64'))
print(X_train_norm.shape)

# We want a stochastic gradient descent with l1 norm.
sgd = sklearn.linear_model.SGDRegressor(
    alpha=.0001, penalty='l1', n_iter=1000)
sgd.fit(X_train_norm, Y_train)
sgd_predictions = sgd.predict(X_test_norm)
sgd_mae = mae_without_log_scaling(Y_test, sgd_predictions)
print('SGDRegressor Mean Absolute Error: {:10.4f}'.format(sgd_mae))

(119970, 10027)
SGDRegressor Mean Absolute Error:  9408.0917


### Note: With the same data, training in batch results in a higher error

In [None]:
# XGBoost hyperparameter tuning using sklearn's Grid Search

PARAMATER = 'max_depth'
PARAMATER_VALUES = [4, 10]

paramaters = {
 PARAMATER:PARAMATER_VALUES
}

loss = sklearn.metrics.make_scorer(mean_absolute_error_salary_scale,
                    greater_is_better=False)

gsearch = sklearn.model_selection.GridSearchCV(
    estimator = XGBClassifier(
        learning_rate =0.1, n_estimators=1000, max_depth=5,
        min_child_weight=1.0, colsample_bytree=0.8, n_jobs=-1),
    param_grid = paramaters, n_jobs=-1,iid=True, cv=2, scoring=loss,
    pre_dispatch=1, verbose=10)

gsearch.fit(X_train_counts, Y_train)



Fitting 2 folds for each of 2 candidates, totalling 4 fits




[CV] max_depth=4 .....................................................


In [None]:
for i in range(0, len(PARAMATER_VALUES)):
    print(PARAMATER + ' ' + str(PARAMATER_VALUES[i]) + ':\n' +
          'Mean training absolute error: ' +
          str(gsearch.cv_results_['mean_train_score'][i]) + ';\n' +
         'Mean testing absolute error: ' +
          str(gsearch.cv_results_['mean_test_score'][i]) + ';\n')

print('Best paramater: ' + str(gsearch.best_params_))

xgb_cv_predictions = gsearch.predict(X_test_counts)

# Scale to get Mean Absolute Error
xgb_cv_mae = mean_absolute_error_salary_scale(Y_test, xgb_cv_predictions)

print('Mean Absolute Error for best paramater on cross validation data: {:10.4f}'.format(
        xgb_cv_mae))