In [4]:
# import related libraries
import json

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn import metrics
from sklearn.base import BaseEstimator
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

# seed both the stdlib and the numpy global generators with one shared value
# so that repeated runs of the notebook are reproducible
import random

SEED = 5500
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# define some useful utility functions
def get_paths():
    """Load the notebook settings from ``SETTINGS.json``.

    The file name is relative, so it is resolved against the current
    working directory.

    Returns
    -------
    The decoded JSON content. The rest of this notebook indexes it with keys
    such as ``"train_data_path"``, ``"model_path"`` and ``"prediction_path"``.

    Raises
    ------
    OSError
        If ``SETTINGS.json`` cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # the context manager closes the handle even when parsing fails;
    # the previous open(...).read() never closed it explicitly
    with open("SETTINGS.json") as settings_file:
        return json.load(settings_file)

def identity(x):
    """Return ``x`` unchanged."""
    return x

# Pass-through converters for the text columns, meant to make the CSV reader keep
# them as strings instead of inferring a data type.
# Not currently passed to any read_csv call in this section.
converters = {
    column: identity
    for column in ("FullDescription", "Title", "LocationRaw", "LocationNormalized")
}

def get_train_df():
    """Read the training CSV named by the ``"train_data_path"`` setting.

    Returns
    -------
    pandas.DataFrame
        The file parsed with pandas' default options.
    """
    settings = get_paths()
    # the pass-through converters are deliberately left out: converters=converters
    return pd.read_csv(settings["train_data_path"])

def get_valid_df():
    """Read the validation CSV named by the ``"valid_data_path"`` setting.

    Returns
    -------
    pandas.DataFrame
        The file parsed with pandas' default options.
    """
    settings = get_paths()
    # the pass-through converters are deliberately left out: converters=converters
    return pd.read_csv(settings["valid_data_path"])

def get_test_df():
    """Read the test CSV named by the ``"test_data_path"`` setting.

    Returns
    -------
    pandas.DataFrame
        The file parsed with pandas' default options.
    """
    settings = get_paths()
    # the pass-through converters are deliberately left out: converters=converters
    return pd.read_csv(settings["test_data_path"])

def save_model(model, filename):
    """Persist ``model`` with joblib.

    The destination is the ``"model_path"`` setting with ``filename`` appended
    by plain string concatenation, so the setting has to end with a path
    separator for the file to land inside that directory.
    """
    destination = get_paths()["model_path"] + filename
    dump(model, destination)

def load_model(filename):
    """Load and return a model previously stored with joblib.

    The source is the ``"model_path"`` setting with ``filename`` appended by
    plain string concatenation.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.
    """
    source = get_paths()["model_path"] + filename
    return load(source)

def write_submission(ids, predictions, filename):
    """Write a two-column submission CSV (``Id``, ``SalaryNormalized``).

    Parameters
    ----------
    ids : values for the ``Id`` column.
    predictions : values for the ``SalaryNormalized`` column.
    filename : appended (plain string concatenation) to the
        ``"prediction_path"`` setting to form the output path.
    """
    destination = get_paths()["prediction_path"] + filename
    submission = pd.DataFrame({"Id": ids, "SalaryNormalized": predictions})
    # index=False keeps the row index out of the file
    submission.to_csv(destination, index=False)

In [None]:
# define a class FeatureMapper for extracting features for different columns automatically in the pipeline
class FeatureMapper:
    """Run one extractor per input column and stack the results column-wise.

    Parameters
    ----------
    features : list of ``(feature_name, column_name, extractor)`` tuples
        ``feature_name`` is a descriptive label only; the mapper never reads it.
        ``column_name`` is used to index the input (``X[column_name]``).
        ``extractor`` must provide ``fit``, ``transform`` and ``fit_transform``.
    """

    def __init__(self, features):
        self.features = features

    @staticmethod
    def _to_dense(fea):
        """Densify ``fea`` when it exposes ``toarray`` (e.g. a sparse matrix); otherwise return it as is."""
        return fea.toarray() if hasattr(fea, "toarray") else fea

    @staticmethod
    def _stack(extracted):
        """Concatenate the per-column blocks along axis 1; a single block is returned unchanged."""
        if len(extracted) > 1:
            return np.concatenate(extracted, axis=1)
        return extracted[0]

    def fit(self, X, y=None):
        """Fit every extractor on its own column and return ``self``.

        Returning ``self`` follows the scikit-learn estimator convention and
        allows ``mapper.fit(X).transform(X)``; the method used to return None.
        """
        for _, column_name, extractor in self.features:
            extractor.fit(X[column_name], y)
        return self

    def transform(self, X):
        """Transform each configured column with its fitted extractor.

        Returns the blocks concatenated along axis 1 in the order given by
        ``features``; with a single feature its block is returned directly.
        """
        extracted = [
            self._to_dense(extractor.transform(X[column_name]))
            for _, column_name, extractor in self.features
        ]
        return self._stack(extracted)

    def fit_transform(self, X, y=None):
        """Fit each extractor on its column and return the stacked result.

        Uses every extractor's own ``fit_transform`` rather than calling
        ``fit`` followed by ``transform``.
        """
        extracted = [
            self._to_dense(extractor.fit_transform(X[column_name], y))
            for _, column_name, extractor in self.features
        ]
        return self._stack(extracted)

In [None]:
def get_unigram_CountVectorizer(max_features):
    """Return a uni-gram ``CountVectorizer`` limited to ``max_features`` vocabulary terms."""
    vectorizer = CountVectorizer(max_features=max_features)
    return vectorizer

def get_unigram_TfidfVectorizer(max_features):
    """Return a uni-gram ``TfidfVectorizer`` limited to ``max_features`` vocabulary terms."""
    vectorizer = TfidfVectorizer(max_features=max_features)
    return vectorizer

def get_binary_CountVectorizer(max_features):
    """Return a binary uni-gram ``CountVectorizer`` limited to ``max_features`` terms.

    Intended for models such as Bernoulli Naive Bayes.
    """
    vectorizer = CountVectorizer(binary=True, max_features=max_features)
    return vectorizer

In [None]:
# define some functions for building a data pipeline
def default_feature_extractor(max_features=100):
    """Build the default bag-of-words ``FeatureMapper`` over the four text columns.

    One independent ``CountVectorizer`` is created per column
    (FullDescription, Title, LocationRaw, LocationNormalized).

    Parameters
    ----------
    max_features : int, default 100
        Vocabulary cap applied to every vectorizer: each one builds a vocabulary
        that only considers the top ``max_features`` terms ordered by term
        frequency across the corpus. The default matches the previously
        hard-coded value.

    Returns
    -------
    FeatureMapper
        Mapper whose ``features`` list holds one
        ``("<column>-Bag of Words", "<column>", CountVectorizer)`` tuple per column.
    """
    text_columns = ("FullDescription", "Title", "LocationRaw", "LocationNormalized")
    features = [
        (column + "-Bag of Words", column, CountVectorizer(max_features=max_features))
        for column in text_columns
    ]
    return FeatureMapper(features)

def default_classifier():
    """Return the default model: a ``RandomForestRegressor`` with fixed settings.

    Despite the function name, the estimator is a regressor. ``random_state``
    is pinned so that repeated fits are reproducible.
    """
    forest_settings = {
        "n_estimators": 50,
        "verbose": 2,
        "n_jobs": 1,
        "min_samples_split": 30,
        "random_state": 3465343,
    }
    return RandomForestRegressor(**forest_settings)

# get a pipeline to automatically extract features and train models
def get_pipeline(features, clf):
    """Chain a feature extractor and a model into a two-step ``Pipeline``.

    Parameters
    ----------
    features : transformer used for the ``"extract_features"`` step.
    clf : estimator used for the final ``"classify"`` step.

    Returns
    -------
    Pipeline
        Pipeline with the steps in that order.
    """
    # possible improvement: use the Pipeline's `memory` parameter to cache the
    # transformer, since fitting transformers could be expensive
    return Pipeline([("extract_features", features), ("classify", clf)])

In [None]:
# feature selection
# load the raw training frame inside this cell: no earlier cell shown here assigns
# `train_data`, so without this line the cell raises a NameError on a fresh kernel
train_data = get_train_df()

# remove Id, LocationRaw, Company, SalaryRaw, SourceName variables before model training
train = train_data.drop(columns=["Id", "LocationRaw", "Company", "SalaryRaw", "SourceName"])

# treat SalaryNormalized as target variable
X = train.drop(columns="SalaryNormalized")
y = train_data.SalaryNormalized

In [None]:
# set up h2o for the deep network model
# NOTE: this cell only imports h2o and calls h2o.init(); no model is trained here yet
import h2o
h2o.init()

In [None]:
# prepare the training data for the xgboost model
# NOTE: this cell only builds the DMatrix; no booster is trained here yet
import xgboost as xgb
data = X
# fixed: use the `pd` alias; the bare name `pandas` is never imported in this notebook
label = pd.DataFrame(y)
# NOTE(review): X presumably still contains the raw text columns at this point;
# confirm that DMatrix accepts them or encode them first
dtrain = xgb.DMatrix(data, label=label)

# missing values can be replaced by a default value in the DMatrix constructor
# dtrain = xgb.DMatrix(data, label=label, missing=-999.0)


In [None]:
# fit the benchmark pipeline on the full training frame (saving is currently disabled)
print("Reading in the training data")
train = get_train_df()

print("Extracting features and training model")
benchmark_pipeline = get_pipeline(
    features=default_feature_extractor(),
    clf=default_classifier(),
)
# the whole frame is passed as input; the target is its SalaryNormalized column
benchmark_pipeline.fit(train, train["SalaryNormalized"])

# persisting the fitted pipeline is switched off for now:
# print("Saving the fitted pipeline")
# save_model(benchmark_pipeline, "benchmark_pipeline.joblib")

In [None]:
# make predictions with the fitted benchmark pipeline on the train, validation and test sets
print("Loading the classifier")
# classifier = load_model("benchmark_pipeline.joblib")
classifier = benchmark_pipeline

print("Making predictions on train set")
train = get_train_df()
# fixed: train-set predictions get their own name. They used to be stored in
# y_pred_test_benchmark and were silently overwritten by the test-set predictions below.
y_pred_train_benchmark = classifier.predict(train)

print("Making predictions on validation set")
valid = get_valid_df()
# a single call applies the already-fitted transforms (the same as for the train set)
# and predicts the salary; nothing is re-fitted here
y_pred_valid_benchmark = classifier.predict(valid)
# y_pred_valid_benchmark = y_pred_valid_benchmark.reshape(len(y_pred_valid_benchmark), 1)

print("Making predictions on test set")
test = get_test_df()
y_pred_test_benchmark = classifier.predict(test)

# write predictions into local csv file for submission
# print("Writing predictions to file")
# write_submission(ids, predictions, filename)

In [None]:
# print performance metrics for each model on train set
train = get_train_df()

def get_mae_train(clf_name, predictions):
    """Print the mean absolute error of ``predictions`` on the train set.

    The reference values are read from the module-level ``train`` frame
    (column ``SalaryNormalized``) at call time.

    Parameters
    ----------
    clf_name : model label inserted into the printed message.
    predictions : predicted salaries, compared against ``train.SalaryNormalized``.
    """
    mae = metrics.mean_absolute_error(train.SalaryNormalized, predictions)
    print("The MAE for %s on train set is %.4f" % (clf_name, mae))

# NOTE(review): `y_pred_train_xgboost` is not assigned in any cell visible here;
# confirm it is produced elsewhere before running this cell
get_mae_train("xgboost", y_pred_train_xgboost)

In [None]:
# print performance metrics for each model on validation set
valid = get_valid_df()

def get_mae_valid(clf_name, predictions):
    """Print the mean absolute error of ``predictions`` on the validation set.

    The reference values are read from the module-level ``valid`` frame
    (column ``SalaryNormalized``) at call time.

    Parameters
    ----------
    clf_name : model label inserted into the printed message.
    predictions : predicted salaries, compared against ``valid.SalaryNormalized``.
    """
    mae = metrics.mean_absolute_error(valid.SalaryNormalized, predictions)
    print("The MAE for %s on validation set is %.4f" % (clf_name, mae))

# NOTE(review): `y_pred_valid_xgboost` is not assigned in any cell visible here;
# confirm it is produced elsewhere before running this cell
get_mae_valid("xgboost", y_pred_valid_xgboost)

In [None]:
# print performance metrics for each model on test set
test = get_test_df()

def get_mae_test(clf_name, predictions):
    """Print the mean absolute error of ``predictions`` on the test set.

    The reference values are read from the module-level ``test`` frame
    (column ``SalaryNormalized``) at call time.

    Parameters
    ----------
    clf_name : model label inserted into the printed message.
    predictions : predicted salaries, compared against ``test.SalaryNormalized``.
    """
    # NOTE(review): assumes the test CSV carries a SalaryNormalized column; confirm
    mae = metrics.mean_absolute_error(test.SalaryNormalized, predictions)
    print("The MAE for %s on test set is %.4f" % (clf_name, mae))

# NOTE(review): `y_pred_test_xgboost` is not assigned in any cell visible here;
# confirm it is produced elsewhere before running this cell
get_mae_test("xgboost", y_pred_test_xgboost)