# Matching Algorithm Model Comparison

Here you will find a comprehensive comparison using our existing match data and testing on various models along with annotation describing each model. The next chunk of code here is just importing various libraries as well as loading in our data.

In [1]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from numpy.random import randn
from numpy.matlib import repmat

from scipy.stats import norm
from scipy.optimize import fmin
from scipy.special import erf

from patsy import dmatrices

from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.linear_model import LogisticRegressionCV, LinearRegression, LassoCV, RidgeClassifierCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer
from sklearn.feature_selection import chi2
from sklearn.ensemble import GradientBoostingClassifier

In [24]:
firms = pd.read_csv('../../match-data/match-v1/firms.csv')
jobs = pd.read_csv('../../match-data/match-v1/job-openings.csv')
jobs_parent = pd.read_csv('../../match-data/match-v1/job-openings-parent.csv')
job_seekers = pd.read_csv('../../match-data/match-v1/job-seekers.csv')
job_seekers = job_seekers[job_seekers['closed'] == False]
matches = pd.read_csv('../../match-data/match-v1/matches.csv')
matches_parent = pd.read_csv('../../match-data/match-v1/matches-parent.csv')

ignore = ['number', 'caseid', 'parent_caseid', 'job_id', 'hired_yes_no', 'quit', 'fired']

job_seekers.columns = ['JS-' + c if c not in ignore else c for c in job_seekers.columns]
firms.columns = ['JOB-' + c if c not in ignore else c for c in firms.columns]
jobs.columns = ['JOB-' + c if c not in ignore else c for c in jobs.columns]
matches.columns = ['MATCH-' + c if c not in ignore else c for c in matches.columns]

matches_merged = pd.merge(matches, matches_parent, on='number')
jobs_merged = pd.merge(jobs, jobs_parent, on='number')

In [25]:
job_seekers = pd.read_csv('../../match-data/thompson.csv')
below = ['none','primary', 'intermediate', 'secondary']
#job_seekers['above_secondary']

In [26]:
def above_below(row):
    if str(row['highest_edu_level']) in below:
        print(row['highest_edu_level'], 0)
        return 0
    else:
        print(row['highest_edu_level'], 1)
        return 1

In [27]:
job_seekers['above_secondary_edu'] = job_seekers.apply(above_below, axis=1)

('diploma', 1)
('intermediate', 0)
('bachelors', 1)
('primary', 0)
('primary', 0)
('secondary', 0)
('secondary', 0)
('diploma', 1)
(nan, 1)
('primary', 0)
('secondary', 0)
('secondary', 0)
('other', 1)
(nan, 1)
('secondary', 0)
('secondary', 0)
('secondary', 0)
('diploma', 1)
('secondary', 0)
('primary', 0)
('primary', 0)
('secondary', 0)
('bachelors', 1)
('bachelors', 1)
('secondary', 0)
('secondary', 0)
('secondary', 0)
('college', 1)
('secondary', 0)
('bachelors', 1)
('primary', 0)
('intermediate', 0)
('primary', 0)
('bachelors', 1)
('primary', 0)
('secondary', 0)
('primary', 0)
('primary', 0)
('bachelors', 1)
('primary', 0)
('diploma', 1)
('primary', 0)
('primary', 0)
('secondary', 0)
('secondary', 0)
('primary', 0)
('primary', 0)
('secondary', 0)
('primary', 0)
(nan, 1)
('secondary', 0)
('diploma', 1)
('secondary', 0)
('diploma', 1)
('secondary', 0)
('primary', 0)
('primary', 0)
('primary', 0)
('primary', 0)
('primary', 0)
('secondary', 0)
('secondary', 0)
('secondary', 0)
('bache

('intermediate', 0)
('bachelors', 1)
('primary', 0)
('primary', 0)
('primary', 0)
('primary', 0)
('secondary', 0)
('secondary', 0)
('primary', 0)
('intermediate', 0)
('secondary', 0)
('none', 0)
('bachelors', 1)
('bachelors', 1)
('primary', 0)
('primary', 0)
('bachelors', 1)
('secondary', 0)
('secondary', 0)
('primary', 0)
('primary', 0)
('intermediate', 0)
('intermediate', 0)
('secondary', 0)
('secondary', 0)
('secondary', 0)
('secondary', 0)
('intermediate', 0)
('secondary', 0)
('secondary', 0)
('bachelors', 1)
('secondary', 0)
('primary', 0)
('primary', 0)
('diploma', 1)
('intermediate', 0)
('secondary', 0)
('secondary', 0)
('secondary', 0)
('bachelors', 1)
('none', 0)
('primary', 0)
('primary', 0)
('secondary', 0)
('college', 1)
('bachelors', 1)
('bachelors', 1)
('bachelors', 1)
('bachelors', 1)
('masters', 1)
('intermediate', 0)
('primary', 0)
('primary', 0)
('primary', 0)
('primary', 0)
('primary', 0)
('bachelors', 1)
('secondary', 0)
('primary', 0)
('secondary', 0)
('bachelors',

('intermediate', 0)
('bachelors', 1)
('bachelors', 1)
('primary', 0)
('intermediate', 0)
('bachelors', 1)
('primary', 0)
('secondary', 0)
('primary', 0)
('secondary', 0)
('primary', 0)
('intermediate', 0)
('secondary', 0)
('intermediate', 0)
('intermediate', 0)
('secondary', 0)
('diploma', 1)
('primary', 0)
('secondary', 0)
('intermediate', 0)
('intermediate', 0)
('secondary', 0)
('secondary', 0)
('bachelors', 1)
('diploma', 1)
('primary', 0)
('secondary', 0)
('primary', 0)
('secondary', 0)
('secondary', 0)
('intermediate', 0)
('primary', 0)
('secondary', 0)
('primary', 0)
('bachelors', 1)
('primary', 0)
('primary', 0)
('intermediate', 0)
('primary', 0)
('intermediate', 0)
('primary', 0)
('intermediate', 0)
('secondary', 0)
('college', 1)
('intermediate', 0)
('intermediate', 0)
('secondary', 0)
('primary', 0)
('intermediate', 0)
('diploma', 1)
('diploma', 1)
('primary', 0)
('bachelors', 1)
('secondary', 0)
('secondary', 0)
('bachelors', 1)
('intermediate', 0)
('secondary', 0)
('interme

In [28]:
job_seekers.to_csv('../../match-data/thompson-after.csv')