# Job Posting Data Acquisition and EDA

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from gensim.models import Word2Vec
from nltk import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns
import re
from imblearn.over_sampling import SMOTE
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import string
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

from dfunc import df_info
from dfunc import chi_sq
from dfunc import feat_to_dum
from dfunc import get_scores

np.random.seed(0)
pd.set_option('display.max_columns', 300)

Using TensorFlow backend.


In [2]:
df = pd.read_csv('fake_job_postings.csv', index_col=0)

## Initial Data Cleaning/Engineering and EDA
- Total Observations: 17880 rows
- Total Features: 16 columns
- Target Variable: 'fraudulent', 0 is real, 1 is false
    - 0: 17014
    - 1: 866
- Features to drop:
    - 'title': No standardization of naming job titles, >1000 different titles
- Categorical Features: 'location', 'salary_range', 'telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry', 'function'
- NLP Features: 'company_profile', 'description', 'requirements', 'benefits'

In [3]:
target = 'fraudulent'

In [4]:
df_info(df, target)

Total Observations: 17880
Target Variable: fraudulent
Classes: 2
Imbalance: 0 - 17014, 1 - 866
Imbalance Ratio: 0 - 95.16%, 1 - 4.84%

No missing values: title, telecommuting, has_company_logo, has_questions, fraudulent

Values Missing:
---------------
location: 346 (1.94%)
department: 11547 (64.58%) ***
salary_range: 15012 (83.96%) ***
company_profile: 3308 (18.5%)
description: 1 (0.01%)
requirements: 2695 (15.07%)
benefits: 7210 (40.32%) ***
employment_type: 3471 (19.41%)
required_experience: 7050 (39.43%) ***
required_education: 8105 (45.33%) ***
industry: 4903 (27.42%) ***
function: 6455 (36.1%) ***


In [5]:
# Drop 'title'
df.drop(columns=['title'], inplace=True)

### Narrow down to US job postings

In [6]:
# Only keeping US job postings
df = df.loc[df['location'].str[:2] == 'US']

### Run Chi-squared tests on features with missing values
- Null Hypotheses: The proportions of false job reportings for null feature values and non-null feature values are equal
- Drop 'function', too many categories, too many missing values, low chi-sq
- Drop 'industry, too many categories, too many missing values, low chi-sq

In [None]:
chi_sq(df, feature='department', target=target)

In [None]:
chi_sq(df, feature='required_education', target=target)

In [None]:
chi_sq(df, feature='required_experience', target=target)

In [None]:
chi_sq(df, feature='industry', target=target)

In [None]:
chi_sq(df, feature='function', target=target)

In [None]:
chi_sq(df, feature='employment_type', target=target)

In [None]:
chi_sq(df, feature='salary_range', target=target)

In [None]:
chi_sq(df, feature='company_profile', target=target)

In [None]:
chi_sq(df, feature='requirements', target=target)

In [None]:
chi_sq(df, feature='benefits', target=target)

In [7]:
# Drop columns
df.drop(columns=['industry', 'function', 'requirements', 'benefits'], inplace=True)

### Salary range feature
- Ratio of fake to real job postings is much greater in postings that include salary range
- Convert feature to whether or not salary is posted

In [8]:
# Create salary dummy
df['salary_range'] = np.where(df['salary_range'].isna() == True, 0, 1)

### Clean location feature
- Replace with state dummies

In [9]:
# Create 'state' feature, if no state exists then 'no state'
condition = df['location'].str.extract(r'([A-Z]{2}(?<!US))').notnull()
value = df['location'].str.extract(r'([A-Z]{2}(?<!US))')
df['state'] = np.where(condition, value, 'No State')
df['state'] = np.where((df['state'] == 'AU') | (df['state'] == 'LO'), 'No State', df['state'])
df.drop(columns='location', inplace=True)

In [10]:
df = feat_to_dum(df, 'state', s_value='Unspecified', pref=None)

Feature Dummied and Dropped: state


### Department feature
- Convert to dummy, too many different categories with no standardization

In [11]:
# Convert department to dummy
df['department'] = np.where(df['department'].isna() == True, 0, 1)

### Company profile feature
- Convert to dummy

In [12]:
df['company_profile'] = np.where(df['company_profile'].isna() == True, 0, 1)

### Remaining features
- Create 'Unspecified' category for Nan values, dummy, the drop column

In [13]:
df = feat_to_dum(df, 'employment_type', s_value='Unspecified', pref='et')

Feature Dummied and Dropped: employment_type


In [14]:
df = feat_to_dum(df, 'required_experience', s_value='Unspecified', pref='rex')

Feature Dummied and Dropped: required_experience


In [15]:
df = feat_to_dum(df, 'required_education', s_value='Unspecified', pref='red')

Feature Dummied and Dropped: required_education


# Modeling

In [16]:
y = df[target]
X = df.drop(columns=target)

In [18]:
pca_1 = PCA(n_components=20)
pca_2 = PCA(n_components=40)
pca_3 = PCA(n_components=60)

principalComponents = pca_1.fit_transform(X.drop(columns='description'))
principalComponents = pca_2.fit_transform(X.drop(columns='description'))
principalComponents = pca_3.fit_transform(X.drop(columns='description'))

print(np.sum(pca_1.explained_variance_ratio_))
print(np.sum(pca_2.explained_variance_ratio_))
print(np.sum(pca_3.explained_variance_ratio_))

pca = PCA(n_components=50)
principalComponents = pca.fit_transform(X.drop(columns='description'))
print(np.sum(pca.explained_variance_ratio_))

0.8397017964357044
0.9538218606326755
0.9926936532656117
0.9789828362653149


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [20]:
# Create splits for text data vs. non-text data
X_train_w = X_train[['description']]
X_test_w = X_test[['description']]
X_train_n = X_train.drop(columns='description')
X_test_n = X_test.drop(columns='description')

## Baseline Models

### Logistic Regression

In [21]:
pipe_lr = Pipeline([('pca', PCA(n_components=50)),
                    ('clf', LogisticRegression())])

pipe_lr.fit(X_train_n, y_train)

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=50,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [22]:
get_scores(pipe_lr, X_test_n, y_test)

F1 Score: 0.208955223880597
Accuracy: 0.937837203847056
[[3963   26]
 [ 239   35]]


### RandomForest

In [23]:
# Instantiate RandomForest pipeline
pipe_rf = Pipeline([('pca', PCA(n_components=50)),
                    ('clf', RandomForestClassifier())])

# Set grid search params
param_grid_forest = {'clf__n_estimators': [175, 200, 225, 250],
                     'clf__criterion': ['gini'],
                     'clf__max_depth': [6],
                     'clf__class_weight': ['balanced', 'balanced_subsample']}

# Construct grid search
gs_rf = GridSearchCV(estimator=pipe_rf,
                     param_grid=param_grid_forest,
                     scoring='f1', cv=5, n_jobs=-1,
                     verbose=1, return_train_score = True)

# Fit using grid search
gs_rf.fit(X_train_n, y_train)
rf_mod = gs_rf.best_estimator_

# Best params
print('\nBest params:\n', gs_rf.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   27.1s finished



Best params:
 {'clf__class_weight': 'balanced', 'clf__criterion': 'gini', 'clf__max_depth': 6, 'clf__n_estimators': 225}


In [24]:
get_scores(gs_rf, X_test_n, y_test)

F1 Score: 0.5393258426966292
Accuracy: 0.9134412385643913
[[3678  311]
 [  58  216]]


### XGBoost

In [25]:
# XGBoost with GridSearchCV params
xgb_params = {'clf__n_estimators': [150, 175],
              'clf__learning_rate': [0.08, 0.07],
              'clf__max_depth': [8],
              'clf__colsample_bytree': [0.8, 0.7],
              'clf__min_child_weight': [1]}

# Instantiate XGBoost pipeline
pipe_xgb = Pipeline([('pca', PCA(n_components=50)),
                     ('clf', xgb.XGBClassifier())])

gs_xgb = GridSearchCV(estimator=pipe_xgb,
                      param_grid=xgb_params,
                      scoring='f1', n_jobs=-1,
                      verbose=1, cv=5)

gs_xgb.fit(X_train_n, y_train)
xgb_mod = gs_xgb.best_estimator_

print('\nBest params:\n', gs_xgb.best_params_)

In [27]:
get_scores(xgb_mod, X_test_n, y_test)

F1 Score: 0.5261044176706827
Accuracy: 0.9446399249354914
[[3896   93]
 [ 143  131]]


## NLP

### Data Prep

In [28]:
# Tokenize descriptions
data = X_train_w['description'].map(word_tokenize).values
data_test = X_test_w['description'].map(word_tokenize).values

In [31]:
stops = stopwords.words('english')

In [32]:
# Create vocabulary from training data
vocab = [ [ word for word in simple_preprocess(str(doc)) if word not in stops ] for doc in data ]
total_vocabulary = set(word for description in vocab for word in description)

In [33]:
# Retrieve vocabulary word vectors from GloVe
glove = {}
with open('glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [34]:
# Stolen from learn.co "Classification With Word Embeddings - Codealong" lab
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # Takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [35]:
# Create DataFrames where columns are vector dimensional values from a mean document vector
w2v = W2vVectorizer(glove)
vec = w2v.transform(data)
vec_test = w2v.transform(data_test)
vcdf_train = pd.DataFrame(vec)
vcdf_test = pd.DataFrame(vec_test)

In [None]:
X_test_n.shape

In [36]:
# Join vector columns with previous data
X_train_final = X_train_n.reset_index().drop(columns='job_id').join(vcdf_train)
X_test_final = X_test_n.reset_index().drop(columns='job_id').join(vcdf_test)

### RandomForest with word vec

In [None]:
rf_w2v_mod = RandomForestClassifier()

In [40]:
# Set grid search params
param_grid_forest = {'n_estimators': [200, 225, 250],
                     'class_weight': ['balanced', 'balanced_subsample', None],
                     ''
                     'criterion': ['gini']}

# Construct grid search
rf_wv = GridSearchCV(estimator=rf_w2v_mod,
                     param_grid=param_grid_forest,
                     scoring='f1', cv=5, n_jobs=-1,
                     verbose=1, return_train_score = True)

# Fit using grid search
rf_wv.fit(X_train_final, y_train)
rfwv_final = rf_wv.best_estimator_

# Best params
print('\nBest params:\n', rfwv.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   37.8s finished



Best params:
 {'clf__class_weight': 'balanced', 'clf__criterion': 'gini', 'clf__max_depth': 6, 'clf__n_estimators': 225}


In [41]:
get_scores(rfwv_final, X_test_final, y_test)

F1 Score: 0.5933503836317136
Accuracy: 0.9627023223082336
[[3988    1]
 [ 158  116]]


In [55]:
# Instantiate RandomForest pipeline
pipe_rfwv = Pipeline([('pca', PCA(n_components=90)),
                      ('clf', RandomForestClassifier())])

# Set grid search params
param_grid_forest = {'clf__n_estimators': [200, 225, 250],
                     'clf__criterion': ['gini', 'entropy'],
                     'clf__max_depth': [6, 7, 8],
                     'clf__class_weight': ['balanced', 'balanced_subsample']}

# Construct grid search
gs_rfwv = GridSearchCV(estimator=pipe_rfwv,
                       param_grid=param_grid_forest,
                       scoring='f1', cv=5, n_jobs=-1,
                       verbose=1, return_train_score = True)

# Fit using grid search
gs_rfwv.fit(X_train_final, y_train)
pipe_rfwv_mod = gs_rfwv.best_estimator_

# Best params
print('\nBest params:\n', gs_rfwv.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   51.3s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  5.0min finished



Best params:
 {'clf__class_weight': 'balanced_subsample', 'clf__criterion': 'entropy', 'clf__max_depth': 7, 'clf__n_estimators': 225}


In [56]:
get_scores(pipe_rfwv_mod, X_test_final, y_test)

F1 Score: 0.6605504587155963
Accuracy: 0.9566033309875674
[[3898   91]
 [  94  180]]


### XGBoost with word vec

In [37]:
xgb_w2v_mod = xgb.XGBClassifier()

In [None]:
# XGBoost with GridSearchCV params
xgb_params = {'clf__n_estimators': [150, 175, 200],
              'clf__learning_rate': [0.09, 0.08, 0.07],
              'clf__max_depth': [7, 8, 9],
              'clf__colsample_bytree': [0.9, 0.8, 0.7],
              'clf__min_child_weight': [1, 0.9, 0.8]}

wv_xgb = GridSearchCV(estimator=xgb_w2v_mod,
                      param_grid=xgb_params,
                      scoring='f1', n_jobs=-1,
                      verbose=1, cv=5)

wv_xgb.fit(X_train_final, y_train)
xgbwv_mod = wv_xgb.best_estimator_

print('\nBest params:\n', wv_xgb.best_params_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   49.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.0min


In [None]:
get_scores(xgbwv_mod, X_test_final, y_test)

### Logistic Regression with word vec

In [None]:
pipe_lrwv = Pipeline([('pca', PCA(n_components=90)),
                      ('clf', LogisticRegression())])

pipe_lrwv.fit(X_train_final, y_train)

In [None]:
get_scores(pipe_lrwv, X_test_final, y_test)