In [15]:
# To default to float division and print function.
from __future__ import (division, print_function)

# Core python libraries
import csv
import re
import math

# External libraries.
import nltk
import numpy
import pandas
import scipy
import sklearn

# Tensorflow and related.
import tensorflow

# For fitting streamlined TensorFlow models.
import tflearn

# So you know when this code block finishes.
print ("Done")

Done


In [80]:
#TODO (max): make a library that does all this preprocessing

# train_test_split lives in sklearn.model_selection from scikit-learn 0.18 on;
# older releases only have sklearn.cross_validation. Importing the function
# explicitly also avoids relying on some other package having loaded the
# submodule as a side effect of the bare `import sklearn`.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

data = pandas.read_csv('data/train_medium.csv')

# Binary classification label: 1 when the normalised salary reaches the
# threshold, 0 otherwise. The comparison is vectorised instead of a per-row
# lambda; the result is the same.
LABEL_COLUMN = "HighPayingJobs"
HIGH_PAY_THRESHOLD = 50000
data[LABEL_COLUMN] = (data["SalaryNormalized"] >= HIGH_PAY_THRESHOLD).astype(int)

# Regression target. No visible cell creates this column, so derive it when
# the CSV does not already provide it. The natural log is used because
# mean_absolute_error_salary_scale maps targets back with numpy.exp.
# NOTE(review): presumably the CSV already carries this column - confirm.
if 'LogSalaryNormalized' not in data.columns:
    data['LogSalaryNormalized'] = numpy.log(data["SalaryNormalized"])

# This needs to be done as otherwise null/Nan/NA values will raise errors where
# we are expecting strings in our categorical variables
data["ContractTime"] = data["ContractTime"].fillna("unknown_value")
data["ContractType"] = data["ContractType"].fillna("unknown_value")

# Split the row labels (not the frame itself) so the train/test frames can be
# rebuilt below with every column intact. The fixed seed keeps the split stable.
X_train_index, X_test_index, Y_train, Y_test = train_test_split(
    data.index, data['LogSalaryNormalized'], test_size=.33, random_state=42)

# Keep train and test as pandas dataframes. The split returns index *labels*,
# so select with .loc; .iloc only gives the same rows while labels happen to
# equal positions.
X_train = data.loc[X_train_index]
X_test = data.loc[X_test_index]

# Uncomment this next line if you want to check the data.
# print(data.head())

print ("Done")

         Id                                              Title  \
0  12612628                        Engineering Systems Analyst   
1  12613049  Engineering Systems Analyst / Mathematical Mod...   
2  12613647         Pioneer, Miser Engineering Systems Analyst   
3  13179816         Engineering Systems Analyst Water Industry   
4  14131336          Senior Subsea Pipeline Integrity Engineer   

                                     FullDescription  \
0  Engineering Systems Analyst Dorking Surrey Sal...   
1  Engineering Systems Analyst / Mathematical Mod...   
2  Pioneer, Miser  Engineering Systems Analyst Do...   
3  Engineering Systems Analyst Water Industry Loc...   
4  A globally renowned engineering and training c...   

                       LocationRaw LocationNormalized   ContractType  \
0          Dorking, Surrey, Surrey            Dorking  unknown_value   
1   Surrey, South East, South East             Surrey  unknown_value   
2   Surrey, South East, South East             Sur

In [17]:
# Passed to CountVectorizer as min_df (an integer is a document-count cutoff).
MIN_WORD_FREQUENCY = 15

# Bag-of-words encoder with English stop words removed.
count_vect = sklearn.feature_extraction.text.CountVectorizer(
    min_df=MIN_WORD_FREQUENCY,
    stop_words='english',
)

# Only the full description field is count-vectorized here, but any other
# text column could be handled the same way. The vocabulary is fitted on the
# training rows and then reused unchanged for the test rows.
X_train_full_description_counts = count_vect.fit_transform(X_train['FullDescription'])
X_test_full_description_counts = count_vect.transform(X_test['FullDescription'])

# Report (rows, vocabulary size) for train and then test, one per line.
print("{}\n{}".format(
    X_train_full_description_counts.shape,
    X_test_full_description_counts.shape))
#TODO (max): suppress the VisibleDeprecationWarning here from numpy. It is being thrown by the package.

(670, 786)
(330, 786)


In [18]:
def mean_absolute_error_salary_scale(y_test, y_predicted):
    """Mean absolute error computed after exponentiating both inputs.

    Args:
        y_test: True target values; ``numpy.exp`` is applied before scoring.
        y_predicted: Predicted values, treated the same way as ``y_test``.

    Returns:
        Whatever ``sklearn.metrics.mean_absolute_error`` returns for the two
        exponentiated inputs.
    """
    actual = numpy.exp(y_test)
    predicted = numpy.exp(y_predicted)
    return sklearn.metrics.mean_absolute_error(actual, predicted)

# Baseline: guess the mean training target (on the log scale) for every test
# row. numpy.full builds the constant vector in one step.
mean_guess = numpy.mean(Y_train)
average_guess = numpy.full(Y_test.shape, mean_guess)

# math.exp of the mean log target is what gets printed here (i.e. the
# geometric mean of the salaries, if Y_train holds natural-log salaries).
print('Mean salary value in training set is  £{:10.2f}'.format(math.exp(mean_guess)))
average_guess_mae = mean_absolute_error_salary_scale(Y_test, average_guess)
print('Guess the average Mean Absolute Error: {:10.4f}'.format(average_guess_mae))

# SGD Needs normalized inputs. Normalizer rescales each *row* to unit L1 norm
# (it does not standardise columns), so every description becomes a vector of
# word proportions.
normalizer = sklearn.preprocessing.Normalizer(norm='l1')
X_train_norm = normalizer.fit_transform(X_train_full_description_counts.astype('float64'))
X_test_norm = normalizer.transform(X_test_full_description_counts.astype('float64'))

# We want a stochastic gradient descent with l1 norm.
# random_state pins the sample shuffling so the reported error is reproducible
# from run to run.
# NOTE(review): newer scikit-learn releases replace n_iter with max_iter -
# confirm against the installed version before upgrading.
sgd = sklearn.linear_model.SGDRegressor(
    alpha=.005, penalty='l1', n_iter=100, random_state=42)
sgd.fit(X_train_norm, Y_train)
sgd_predictions = sgd.predict(X_test_norm)
sgd_mae = mean_absolute_error_salary_scale(Y_test, sgd_predictions)
print('SGDRegressor Mean Absolute Error: {:10.4f}'.format(sgd_mae))
#TODO (any): wonder why this is so inaccurate/wrong
# NOTE(review): the recorded run scored worse than the constant baseline
# above. One thing to check is whether the fit has converged at all, given
# how small the row-normalised inputs are relative to the target - TODO confirm.

Mean salary value in training set is  £  25941.22
Guess the average Mean Absolute Error:  9396.2364
SGDRegressor Mean Absolute Error: 27970.8024


In [93]:
# Will try and do a logistic regression to predict High paying jobs as per
# https://www.tensorflow.org/versions/r0.10/tutorials/wide/index.html
# print(X_test[LABEL_COLUMN])

In [58]:
# Show the distinct values in the ContractType column of `data`.
print(set(data['ContractType']))

set(['unknown_value', 'part_time', 'full_time'])


In [68]:
# feature_cols: A dict from feature column names to Tensors or SparseTensors.
# label: A Tensor containing the label column

# The keys of the feature_cols will be used when constructing columns in the
# next section. Because we want to call the fit and evaluate methods with
# different data, we define two different input builder functions,
# train_input_fn and eval_input_fn, which are identical except that they pass
# different data to input_fn. Note that input_fn will be called while
# constructing the TensorFlow graph, not while running the graph. What it
# returns is a representation of the input data as the fundamental unit of
# TensorFlow computations, a Tensor (or SparseTensor).

# Columns that input_fn wraps in tensorflow.constant.
# NOTE(review): LABEL_COLUMN is computed from "SalaryNormalized" in the data
# loading cell, so using that column as a feature leaks the label - confirm
# this is intentional.
# NOTE(review): no visible cell creates "DescriptionLength"; presumably it is
# already present in the CSV - verify.
CONTINUOUS_COLUMNS = ["SalaryNormalized", "DescriptionLength"]

# Columns that input_fn wraps in tensorflow.SparseTensor.
CATEGORICAL_COLUMNS = ["ContractTime", "ContractType"]

def input_fn(df):
  """Build the (features, label) pair for one DataFrame.

  Args:
    df: pandas DataFrame holding every column named in CONTINUOUS_COLUMNS,
      CATEGORICAL_COLUMNS and LABEL_COLUMN.

  Returns:
    A tuple ``(feature_cols, label)``. ``feature_cols`` maps each column name
    to a constant Tensor (continuous columns) or a SparseTensor with one value
    per row (categorical columns). ``label`` is a constant Tensor of the
    LABEL_COLUMN values.
  """
  num_rows = len(df)
  # Every categorical column stores exactly one value per row, at [row, 0].
  row_indices = [[i, 0] for i in range(num_rows)]

  # Build one dict incrementally. The previous
  # dict(a.items() + b.items()) merge only works on Python 2, where items()
  # returns lists; on Python 3 it raises TypeError.
  feature_cols = {}
  for name in CONTINUOUS_COLUMNS:
    feature_cols[name] = tensorflow.constant(df[name].values)
  for name in CATEGORICAL_COLUMNS:
    # NOTE(review): later TensorFlow releases appear to call this argument
    # dense_shape rather than shape - confirm against the installed version.
    feature_cols[name] = tensorflow.SparseTensor(
        indices=row_indices,
        values=df[name].values,
        shape=[num_rows, 1])

  # Converts the label column into a constant Tensor.
  label = tensorflow.constant(df[LABEL_COLUMN].values)
  return feature_cols, label

def train_input_fn():
  """Input builder for fitting: runs input_fn on the X_train frame."""
  return input_fn(df=X_train)

def eval_input_fn():
  """Input builder for evaluation: runs input_fn on the X_test frame."""
  return input_fn(df=X_test)


In [97]:
import tempfile

# Fresh temporary directory passed to the estimator as model_dir.
model_dir = tempfile.mkdtemp()

# Real-valued feature columns, keyed by the names used in input_fn's dict.
salary_normalized = tensorflow.contrib.layers.real_valued_column("SalaryNormalized")
description_length = tensorflow.contrib.layers.real_valued_column("DescriptionLength")

# String features are hashed into a fixed number of buckets.
contract_time = tensorflow.contrib.layers.sparse_column_with_hash_bucket(
    "ContractTime", hash_bucket_size=100)
contract_type = tensorflow.contrib.layers.sparse_column_with_hash_bucket(
    "ContractType", hash_bucket_size=100)

# Linear classifier trained with FTRL and both L1 and L2 regularisation.
# Only salary_normalized is enabled; the other columns are defined above but
# left out of the model for now.
m = tensorflow.contrib.learn.LinearClassifier(
    feature_columns=[
        salary_normalized,
        #description_length, contract_time, contract_type
    ],
    optimizer=tensorflow.train.FtrlOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=1.0,
        l2_regularization_strength=1.0),
    model_dir=model_dir)

In [98]:
# Fit the classifier on the data supplied by train_input_fn for 2000 steps.
m.fit(input_fn=train_input_fn, steps=2000)


LinearClassifier()

In [100]:
# Evaluate on the data supplied by eval_input_fn, then list every returned
# metric in alphabetical order of its name.
results = m.evaluate(input_fn=eval_input_fn, steps=100)
for key, value in sorted(results.items()):
    print ("%s: %s" % (key, value))

accuracy: 0.942424
eval_auc: 0.5
loss: 8.97484
