In [1]:
# To default to float division
from __future__ import division

# Core python libraries
import csv
import re
import math
import collections

# External libraries.
import nltk
import numpy
import pandas
import scipy
import sklearn
# Sklearn is not eager in importing all of its subpackages. Have to explicity load the ones we want.
import sklearn.ensemble
import sklearn.model_selection


# So you know when this code block finishes.
print ("Done")

Done


In [2]:
LDA = False

data = pandas.read_csv('data/train.csv', nrows=1000)

X_train_index, X_test_index, Y_train, Y_test = sklearn.model_selection.train_test_split(
    data.index, data['LogSalaryNormalized'], test_size=.2, random_state=42)

# Keep train and test as pandas dataframes.
X_train = data.iloc[X_train_index]
X_test = data.iloc[X_test_index]

if LDA:
    lda_data = pandas.read_csv('data/train_large_lda_50.csv')

    X_train_lda = lda_data.iloc[X_train_index]
    X_test_lda = lda_data.iloc[X_test_index]


data.head()


Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName,TitleRaw,DescriptionLength,LogSalaryNormalized,FullDescriptionWithTitle
0,12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,"Dorking, Surrey, Surrey",Dorking,unknown_value,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk,Engineering Systems Analyst,348,10.126631,Engineering Systems Analyst Engineering System...
1,12613049,Engineering Systems Analyst Mathematical Modeller,Engineering Systems Analyst / Mathematical Mod...,"Surrey, South East, South East",Surrey,unknown_value,permanent,Gregory Martin International,Engineering Jobs,25000 - 30000/annum 25K-30K negotiable,27500,cv-library.co.uk,Engineering Systems Analyst / Mathematical Mod...,397,10.221941,Engineering Systems Analyst Mathematical Model...
2,12613647,Pioneer Miser Engineering Systems Analyst,"Pioneer, Miser Engineering Systems Analyst Do...","Surrey, South East, South East",Surrey,unknown_value,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk,"Pioneer, Miser Engineering Systems Analyst",216,10.126631,Pioneer Miser Engineering Systems Analyst Pion...
3,13179816,Engineering Systems Analyst Water Industry,Engineering Systems Analyst Water Industry Loc...,"Dorking, Surrey, Surrey, Surrey",Dorking,unknown_value,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20K to 30K,25000,cv-library.co.uk,Engineering Systems Analyst Water Industry,1602,10.126631,Engineering Systems Analyst Water Industry Eng...
4,14131336,Senior Subsea Pipeline Integrity Engineer,A globally renowned engineering and training c...,"Aberdeen, Borders",UK,unknown_value,permanent,Indigo 21 Ltd,Engineering Jobs,50000 - 100000/annum,75000,cv-library.co.uk,Senior Subsea Pipeline Integrity Engineer,1364,11.225243,Senior Subsea Pipeline Integrity Engineer A gl...


In [3]:
HASH = False

# All the words
MIN_WORD_FREQUENCY = 2

FIELD = 'FullDescription' # Previously 'Title'

# Creates a true word count
# TODO Beware the  "decode_error='ignore'" please resolve.
count_vect = sklearn.feature_extraction.text.CountVectorizer(
  stop_words='english',
    min_df=MIN_WORD_FREQUENCY, decode_error='ignore')


# Here I am count vectoring the full description field. But in theory
# any text field can be processed this way.

# For final model training, training on all the data.
# Fit on all the words and their frequencies.  But do not transform that as it is  meaningless.
# Then transform each training batch and the testing batch.
X_train_title_counts = count_vect.fit_transform(X_train[FIELD])
X_test_title_counts = count_vect.transform(X_test[FIELD])


print(X_train_title_counts.shape)
print(X_test_title_counts.shape)
#TODO (max): supress the VisibleDeprecationWarning here from numpy. It is being thrown by the package.


if HASH:
    # Creates a hashed count
    hash_vect = sklearn.feature_extraction.text.HashingVectorizer(
      stop_words='english', n_features=10000,  decode_error='ignore')

    # Here I am hash vectoring the full description field. But in theory
    # any text field can be processed this way.
    X_train_full_description_hashs = hash_vect.fit_transform(X_train[FIELD])
    X_test_full_description_hashs = hash_vect.transform(X_test[FIELD])

    print(X_train_full_description_hashs.shape)
    print(X_test_full_description_hashs.shape)

(800, 3629)
(200, 3629)


In [4]:
from scipy import sparse

#from sklearn.feature_extraction import DictVectorizer
from  sklearn import preprocessing

def binarize_column(column_name):
    lb = preprocessing.LabelBinarizer()
    train_output = lb.fit_transform(X_train[column_name])
    test_output = lb.transform(X_test[column_name])
    return train_output, test_output

#enc = preprocessing.OneHotEncoder()
#output = enc.fit_transform() 

train_contract_type, test_contract_type = binarize_column('ContractType')
train_contract_time, test_contract_time = binarize_column('ContractTime')
train_category, test_category = binarize_column('Category')

train_company, test_company = binarize_column('Company')
train_location, test_location = binarize_column('LocationNormalized')



X_train_categorical = numpy.concatenate((train_contract_type, train_contract_time, train_category, train_company, train_location), axis=1)
X_test_categorical = numpy.concatenate((test_contract_type, test_contract_time, test_category, test_company, test_location), axis=1)


print(X_train_categorical.shape)
print(X_test_categorical.shape)

# Need to use hstack when using a sparse matrix. As is the case with our title bag of words
X_training_set = sparse.hstack((X_train_categorical, X_train_title_counts))
X_testing_set = sparse.hstack((X_test_categorical, X_test_title_counts))

print(X_training_set.shape)

(800, 381)
(200, 381)
(800, 4010)


In [5]:
# print X_train[FIELD].str
new_column = X_train[FIELD]
new_column = new_column.str.replace(r'(?i)Speciliast', r'Specialist')


#X_train['Title'] = X_train['Title']
job_title_words = collections.Counter()
for title in new_column:
    for word in title.split(' '):
        job_title_words[str.lower(word)] +=1

for word, count in job_title_words.most_common():
    print("%d   %s" % (count, word)) 

print(len(job_title_words))

6641   and
5617   the
5305   to
4687   a
3520   of
3298   in
2827   for
2040   is
2039   care
2005   with
1709   be
1705   you
1696   ****
1682   will
1614   are
1472   home
1446   
1219   an
1214   as
1189   this
1071   have
1015   nursing
975   on
973   or
929   experience
917   we
868   work
825   within
800   nurse
755   working
749   your
719   manager
666   support
624   looking
620   their
586   all
567   role
562   please
538   at
538   our
525   registered
514   who
499   job
483   if
475   team
462   staff
449   service
441   must
425   client
422   that
411   apply
408   per
407   excellent
396   rgn
369   elderly
366   required
348   full
341   healthcare
332   successful
328   experienced
325   from
323   recruitment
311   management
310   health
310   training
309   part
305   provide
305   skills
301   candidate
287   opportunity
287   time
286   people
276   qualified
271   social
266   general
260   high
258   residents
255   hours
251   salary
250   worker
248   by
24

In [10]:
def modelfit(alg, dtrain, predictors, 
             useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

mean_guess = True
SGD = False
RF = False
XGBOOST = True


# These should be imported from elsewhere.
def original_salary_scale(log_value):
    return math.exp(log_value)

def mean_absolute_error_salary_scale(y_test, y_predicted):
    return sklearn.metrics.mean_absolute_error(
        numpy.exp(y_test), numpy.exp(y_predicted))

if mean_guess:
    # Guess the average. Create an empty vector of the desired shape.
    average_guess = numpy.empty(Y_test.shape)
    average_guess.fill(numpy.mean(Y_train))

    mean_guess = average_guess[0]
    print('Mean salary value in training set is  £{:10.2f}'.format(original_salary_scale(mean_guess)))
    average_guess_mae = mean_absolute_error_salary_scale(Y_test, average_guess)
    print('Guess the average Mean Absolute Error: {:10.4f}'.format(average_guess_mae))

if SGD:
    # SGD Needs normalized inputs
    # TODO: Suppress DataConversionWarning from converting ints to floats here.
    normalizer = sklearn.preprocessing.Normalizer(norm='l1')
    X_train_norm = normalizer.fit_transform(X_train_full_description_counts.astype('float64'))
    X_test_norm = normalizer.transform(X_test_full_description_counts.astype('float64'))

    # We want a stochastic gradient descent with l1 norm.
    sgd = sklearn.linear_model.SGDRegressor(alpha=.005, penalty='l1', n_iter=10)
    sgd.fit(X_train_norm, Y_train)
    sgd_predictions = sgd.predict(X_test_norm)
    sgd_mae = mean_absolute_error_salary_scale(Y_test, sgd_predictions)
    print('SGDRegressor Mean Absolute Error: {:10.4f}'.format(sgd_mae))
    #TODO (any): wonder why this is so inaccurate/wrong

if RF:
    # Random Forest.
    rf = sklearn.ensemble.RandomForestRegressor(n_estimators=10)
    #rf.fit(X_train_full_description_counts, Y_train)
    # TEST
    rf.fit(X_training_set, Y_train)


    rf_predictions = rf.predict(X_testing_set)

    # Mean Absolute Error
    rf_mae = mean_absolute_error_salary_scale(Y_test, rf_predictions)
    print('Random Forest Regressor Mean Absolute Error: {:10.4f}'.format(rf_mae))
    
import xgboost

if XGBOOST:
    # XGBoost.
    model = xgboost.XGBRegressor(
        learning_rate=.15, max_depth=10, n_estimators=1000, silent=False,
        subsample=.8, colsample_bytree=.8, nthread=4, min_child_weight=.5)
    #rf.fit(X_train_full_description_counts, Y_train)
    # TEST
    print(model)
    #modelfit(model, X_training_set,)
    
    
    #pass
    
    model.fit(X_training_set, Y_train)


    predictions = model.predict(X_testing_set)

    # Mean Absolute Error
    mae = mean_absolute_error_salary_scale(Y_test, predictions)
    print('XGBoost Mean Absolute Error: {:10.4f}'.format(mae))
    print(type(X_training_set))
    """
    cvresult = xgboost.cv(
        model.get_xgb_params(), X_training_set,
        num_boost_round=model.get_params()['n_estimators'], nfold=5,
        metrics='auc', early_stopping_rounds=50)
    """
    """
    param_test = {
     'learning_rate':  [.01],
    }
    grid_search_model = xgboost.XGBRegressor(
        learning_rate=.15, n_estimators=1000, silent=False,
        subsample=.8, colsample_bytree=.8, nthread=4, min_child_weight=.5)
    print("Beginning grid search:")
    gsearch = sklearn.model_selection.GridSearchCV(estimator = grid_search_model, 
    param_grid=param_test, scoring='roc_auc', n_jobs=4, iid=False, cv=2)
    gsearch.fit(X_training_set, Y_train)
    
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
    """


Mean salary value in training set is  £  25664.73
Guess the average Mean Absolute Error:  9701.4659
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.15, max_delta_step=0, max_depth=10,
       min_child_weight=0.5, missing=None, n_estimators=1000, nthread=4,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.8)
XGBoost Mean Absolute Error:  4130.0493
<class 'scipy.sparse.coo.coo_matrix'>


In [44]:
# Check where the biggest errors are
error_tuples = []
for i, val in enumerate(Y_test):
    #print "%s %s" % (i, val)
    error = abs(rf_predictions[i] - Y_test.iloc[i])
    error_pound = abs(math.exp(rf_predictions[i]) - math.exp(Y_test.iloc[i]))
    title = X_test[FIELD].iloc[i]
    full_description =  X_test['FullDescription'].iloc[i]
    # Check the salary raw for some of the super low  
    predicted = math.exp(rf_predictions[i])
    actual = math.exp(Y_test.iloc[i])
    error_tuples.append((title, error, error_pound, predicted, actual, full_description))


In [45]:
error_tuples.sort(key=lambda x: x[1], reverse=True)

for a, b, c, d, e, f in error_tuples:
    print "%s   Title: %s    Predicted: %s Actual: %s \n %s \n" % (b, a, d, e, f)

1.70653879953   Title: Dyslexia Workplace Tutor    Predicted: 12196.3222088 Actual: 67200.0 
 This role includes travelling to a clients workplace who has hidden disabilties to provide coping strategies or technical training. Good knowledge of both PC MAC is a must have and good communication skills are essential. Teaching experience is also preferred. The right candiate must be patient with regards to disability. The role will also include training clients on any assistive technology, software or hardware that they have been provided with. Driving licence and own transport is essential due to the nature of this role. Willingness to travel up to **** miles also essentail. Full training will be provided for successful applicants. Please note this is a self employed vacancy. 

1.70588269775   Title: OOH GP Job Ipswich    Predicted: 26152.1289926 Actual: 144000.0 
 OOH GP Job Ipswich “MedCo (Europe) Ltd are currently looking for GP s to assist with our ever growing number of available Out

In [9]:
print(sgd.intercept_)
print(mean_guess)

[ 10.24537985]
10.2453808753


In [22]:
rf = sklearn.ensemble.RandomForestRegressor(n_estimators=100, n_jobs=-1, warm_start=True)

In [None]:
# If warm start is set we can add more estimators to our random forest
print("We currently have %s trees in our random forest" % len(rf.estimators_))

rf.n_estimators +=2000
if LDA:
    # Fit some models on the LDA output data to see how they perform.
    # Random Forest.
    rf.fit(X_train_lda, Y_train)
    rf_predictions = rf.predict(X_test_lda)

    # Mean Absolute Error
    rf_mae = mean_absolute_error_salary_scale(Y_test, rf_predictions)
    print('Random Forest Regressor Mean Absolute Error: {:10.4f}'.format(rf_mae))

We currently have 2250 trees in our random forest
