In [43]:
import numpy
import pandas
import sklearn.ensemble
import sklearn.feature_extraction
import sklearn.linear_model
import sklearn.model_selection

#import xgboost - Not working on Rob's laptop

print("Done")

Done


In [2]:
# General setup.

# Return mean absolute error scale.
def mean_absolute_error_salary_scale(y_test, y_predicted):
    return sklearn.metrics.mean_absolute_error(
        numpy.exp(y_test), numpy.exp(y_predicted))

In [50]:
# Define training and test data.

data = pandas.read_csv('data/train_large.csv')

X_train_index, X_test_index, Y_train, Y_test = sklearn.model_selection.train_test_split(
    data.index, data['LogSalaryNormalized'], test_size=.3, random_state=42)

# Define LDA data.
lda_data = pandas.read_csv('data/train_large_lda_50.csv')

X_train_lda = lda_data.iloc[X_train_index]
X_test_lda = lda_data.iloc[X_test_index]

# Define BOW data.
X_train_bow = data.iloc[X_train_index]
X_test_bow = data.iloc[X_test_index]

count_vect = sklearn.feature_extraction.text.CountVectorizer(
    stop_words='english', min_df=5, decode_error='ignore')

# Transform BOW test set.
X_train_counts = count_vect.fit_transform(X_train_bow['FullDescriptionWithTitle'])

In [13]:
# Model: Random Forests
# Data Representation: LDA
# Library used: sklearn

# NOTE: Have since retrained LDA using sklearn and online mini-batch learning.

rf = sklearn.ensemble.RandomForestRegressor(
    n_estimators=1000, n_jobs=-1, warm_start=True)

rf.fit(X_train_lda, Y_train)
rf_predictions = rf.predict(X_test_lda)

# Scale to get Mean Absolute Error
rf_mae = mean_absolute_error_salary_scale(Y_test, rf_predictions)

print('Random Forest Regressor Mean Absolute Error: {:10.4f}'.format(rf_mae))

Random Forest Regressor Mean Absolute Error:  8380.5428


### Interesting note: Using LDA, n_estimators = 10 performs better than 1000

In [None]:
# Model: XGBoost
# Data Representation: LDA
# Library used: xgboost

# NOTE: xgboost will not import on Rob's laptop.

xgb = xgboost.XGBRegressor(
        learning_rate=.10, max_depth=10, n_estimators=1000, silent=False,
        subsample=.8, colsample_bytree=.8, nthread=4, min_child_weight=.5)

In [26]:
# Model: SGD
# Data Representation: LDA
# Library used: sklearn

# Normalization not required as LDA output is normalised between 0 and
# 1 anyway.

# We want a stochastic gradient descent with l1 norm.
sgd = sklearn.linear_model.SGDRegressor(
    alpha=.0001, penalty='l1', n_iter=10000)
sgd.fit(X_train_lda, Y_train)
sgd_predictions = sgd.predict(X_test_lda)
sgd_mae = mean_absolute_error_salary_scale(Y_test, sgd_predictions)
print('SGDRegressor Mean Absolute Error: {:10.4f}'.format(sgd_mae))

SGDRegressor Mean Absolute Error:  8929.7799


In [48]:
# Model: Random Forests using batch learning
# Data Representation: LDA
# Library used: sklearn

def batch_processed_rf(X_train, Y_train, batch_size=1000, n_estimators_per_batch=100):
    """Function to process random forest in batches using warm start."""
    
    def run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows):     
        """Function to run next batch of random forest."""
        processed_rows = total_processed_rows + len(X_train_batch)
        rf.set_params(n_estimators=n_estimators)
        rf.fit(X_train_batch, Y_train_batch)
        rf_predictions = rf.predict(X_test_lda)
        rf_mae = mean_absolute_error_salary_scale(Y_test, rf_predictions)
        print('Number of rows trained on: {:10.4f}'.format(processed_rows))
        print('Number of trees in forest: {:10.4f}'.format(n_estimators))
        print('Random Forest Regressor Mean Absolute Error: {:10.4f}'.format(rf_mae))
        print('')
    
    n_estimators = n_estimators_per_batch
    total_processed_rows = 0
    
    # Define model.
    rf = sklearn.ensemble.RandomForestRegressor(
    n_estimators=n_estimators_per_batch, n_jobs=-1, warm_start=True)
    
    # Reindex training and test data.
    # Should randomize here, but we have already taken this step, so no need in our case.
    X_train.reset_index(drop=True)
    Y_train.reset_index(drop=True)
    
    while total_processed_rows < len(X_train) - batch_size:     
        # Scale to get Mean Absolute Error
        batch_index = range(total_processed_rows, total_processed_rows + batch_size)
        X_train_batch = X_train.iloc[batch_index]
        Y_train_batch = Y_train.iloc[batch_index]
        run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows)
        total_processed_rows += batch_size
        n_estimators += n_estimators_per_batch
        
    remaining_rows = len(X_train) - total_processed_rows
    if remaining_rows == 0:
        return
    batch_index = range(total_processed_rows, len(X_train))
    X_train_batch = X_train.iloc[batch_index]
    Y_train_batch = Y_train.iloc[batch_index]
    run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows)

batch_processed_rf(X_train_lda, Y_train, batch_size=1000, n_estimators_per_batch=100)

Number of rows trained on:  1000.0000
Number of trees in forest:   100.0000
Random Forest Regressor Mean Absolute Error:  9446.9799

Number of rows trained on:  2000.0000
Number of trees in forest:   200.0000
Random Forest Regressor Mean Absolute Error:  9244.8231

Number of rows trained on:  3000.0000
Number of trees in forest:   300.0000
Random Forest Regressor Mean Absolute Error:  9130.0379

Number of rows trained on:  4000.0000
Number of trees in forest:   400.0000
Random Forest Regressor Mean Absolute Error:  9102.5227

Number of rows trained on:  5000.0000
Number of trees in forest:   500.0000
Random Forest Regressor Mean Absolute Error:  9106.9557

Number of rows trained on:  6000.0000
Number of trees in forest:   600.0000
Random Forest Regressor Mean Absolute Error:  9113.0142

Number of rows trained on:  7000.0000
Number of trees in forest:   700.0000
Random Forest Regressor Mean Absolute Error:  9102.6697

Number of rows trained on:  8000.0000
Number of trees in forest:   80

### Note: With the same data, training in batch results in a higher error

In [47]:
# Model: Random Forests using batch learning
# Data Representation: Bag of Words
# Library used: sklearn

# NOTE: This is just to get this set up and working.
# TODO: Move this to be a single function under 'Random Forests' where you simply define the
# model as being BOW or LDA.

def batch_processed_rf_bow(X_train, Y_train, batch_size=1000, n_estimators_per_batch=100):
    """Function to process random forest in batches using warm start."""
    # Define model.
    n_estimators = n_estimators_per_batch
    total_processed_rows = 0
    
    rf = sklearn.ensemble.RandomForestRegressor(
        n_estimators=n_estimators_per_batch, n_jobs=-1, warm_start=True)
   
    # Reindex training and test data.
    # Should randomize here, but we have already taken this step, so no need in our case.
    X_train.reset_index(drop=True)
    Y_train.reset_index(drop=True)
    
    count_vect = sklearn.feature_extraction.text.CountVectorizer(
        stop_words='english', min_df=5, decode_error='ignore')

    # Transform BOW test set.
    X_test_counts = count_vect.transform(X_test_bow)
    
    def run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows):     
        """Function to run next batch of random forest."""
        # Transform the batch.
        X_train_counts = count_vect.fit_transform(X_train_batch)
        processed_rows = total_processed_rows + len(X_train_batch)
        rf.set_params(n_estimators=n_estimators)
        rf.fit(X_train_counts, Y_train_batch)
        rf_predictions = rf.predict(X_test_counts)
        rf_mae = mean_absolute_error_salary_scale(Y_test, rf_predictions)
        print('Number of rows trained on: {:10.4f}'.format(processed_rows))
        print('Number of trees in forest: {:10.4f}'.format(n_estimators))
        print('Random Forest Regressor Mean Absolute Error: {:10.4f}'.format(rf_mae))
        print('')
    
    while total_processed_rows < len(X_train) - batch_size:     
        # Scale to get Mean Absolute Error
        batch_index = range(total_processed_rows, total_processed_rows + batch_size)
        X_train_batch = X_train.iloc[batch_index]
        Y_train_batch = Y_train.iloc[batch_index]
        run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows)
        total_processed_rows += batch_size
        n_estimators += n_estimators_per_batch
        
    remaining_rows = len(X_train) - total_processed_rows
    if remaining_rows == 0:
        return
    batch_index = range(total_processed_rows, len(X_train))
    X_train_batch = X_train.iloc[batch_index]
    Y_train_batch = Y_train.iloc[batch_index]
    run_batch(X_train_batch, Y_train_batch, n_estimators, total_processed_rows)

batch_processed_rf(X_train_lda, Y_train, batch_size=1000, n_estimators_per_batch=100)

Number of rows trained on:  1000.0000
Number of trees in forest:   100.0000
Random Forest Regressor Mean Absolute Error:  9472.3956

Number of rows trained on:  2000.0000
Number of trees in forest:   200.0000
Random Forest Regressor Mean Absolute Error:  9253.1221

Number of rows trained on:  3000.0000
Number of trees in forest:   300.0000
Random Forest Regressor Mean Absolute Error:  9155.1866

Number of rows trained on:  4000.0000
Number of trees in forest:   400.0000
Random Forest Regressor Mean Absolute Error:  9114.2747

Number of rows trained on:  5000.0000
Number of trees in forest:   500.0000
Random Forest Regressor Mean Absolute Error:  9115.9774

Number of rows trained on:  6000.0000
Number of trees in forest:   600.0000
Random Forest Regressor Mean Absolute Error:  9123.1083

Number of rows trained on:  7000.0000
Number of trees in forest:   700.0000
Random Forest Regressor Mean Absolute Error:  9112.6177

Number of rows trained on:  8000.0000
Number of trees in forest:   80

In [98]:
COMPLETE_JOB_LISTING_TEMPLATE = """
    Job Title: %s\n
    Location: %s\n
    Company: %s\n
    Category: %s\n
    Contract Type: %s\n
    Contract Time: %s\n
    Full Description: %s
    """

def construct_complete_job_listing(dataset):
    listings = []
    fields = ['Title', 'LocationNormalized', 'Company', 'Category',
              'ContractType', 'ContractTime', 'FullDescription']
    for row in dataset.itertuples(name="Creaghs"):
        print(getattr(row, "Title"))
        print((field for field in fields,))
        #listings.append(COMPLETE_JOB_LISTING_TEMPLATE % 
        #        getattr(row, field) for field in fields)
        #print(index/len(dataset))
    return listings

#print(type(data))
print(construct_complete_job_listing(data))

Engineering Systems Analyst
<generator object construct_complete_job_listing.<locals>.<genexpr> at 0x110843eb8>
Engineering Systems Analyst Mathematical Modeller
<generator object construct_complete_job_listing.<locals>.<genexpr> at 0x110843eb8>
Pioneer Miser Engineering Systems Analyst
<generator object construct_complete_job_listing.<locals>.<genexpr> at 0x110843eb8>
Engineering Systems Analyst Water Industry
<generator object construct_complete_job_listing.<locals>.<genexpr> at 0x110843eb8>
Senior Subsea Pipeline Integrity Engineer
<generator object construct_complete_job_listing.<locals>.<genexpr> at 0x110843eb8>
RECRUITMENT CONSULTANT CONSTRUCTION TECHNICAL TRADES LABOUR
<generator object construct_complete_job_listing.<locals>.<genexpr> at 0x110843eb8>
PROJECT ENGINEER PHARMACEUTICAL
<generator object construct_complete_job_listing.<locals>.<genexpr> at 0x110843eb8>
Chef de Partie Award Winning Restaurant Excellent Tips
<generator object construct_complete_job_listing.<locals>.<g