# Job Posting Data Acquisition and EDA

In [81]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.metrics import confusion_matrix
# from gensim.models import Word2Vec
# from nltk import word_tokenize
# import matplotlib.pyplot as plt
# import seaborn as sns
import re

from dfunc import df_info
from dfunc import chi_sq
from dfunc import feat_to_dum

np.random.seed(0)
pd.set_option('display.max_columns', 300)

In [2]:
df = pd.read_csv('fake_job_postings.csv', index_col=0)

## Initial Data Cleaning/Engineering and EDA
- Total Observations: 17880 rows
- Total Features: 16 columns
- Target Variable: 'fraudulent', 0 is real, 1 is false
    - 0: 17014
    - 1: 866
- Features to drop:
    - 'title': No standardization of naming job titles, >1000 different titles
- Categorical Features: 'location', 'salary_range', 'telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry', 'function'
- NLP Features: 'company_profile', 'description', 'requirements', 'benefits'

In [3]:
target = 'fraudulent'

In [4]:
df_info(df, target)

Total Observations: 17880
Target Variable: fraudulent
Classes: 2
Imbalance: 0 - 17014, 1 - 866
Imbalance Ratio: 0 - 95.16%, 1 - 4.84%

No missing values: title, telecommuting, has_company_logo, has_questions, fraudulent

Values Missing:
---------------
location: 346 (1.94%)
department: 11547 (64.58%) ***
salary_range: 15012 (83.96%) ***
company_profile: 3308 (18.5%)
description: 1 (0.01%)
requirements: 2695 (15.07%)
benefits: 7210 (40.32%) ***
employment_type: 3471 (19.41%)
required_experience: 7050 (39.43%) ***
required_education: 8105 (45.33%) ***
industry: 4903 (27.42%) ***
function: 6455 (36.1%) ***


In [5]:
# Drop 'title'
df.drop(columns=['title'], inplace=True)

### Narrow down to US job postings

In [6]:
# Only keeping US job postings
df = df.loc[df['location'].str[:2] == 'US']

### Run Chi-squared tests on features with missing values
- Null Hypotheses: The proportions of false job reportings for null feature values and non-null feature values are equal
- Drop 'function', too many categories, too many missing values, low chi-sq
- Drop 'industry, too many categories, too many missing values, low chi-sq

In [7]:
chi_sq(df, feature='department', target=target)

Reject Null Hypothesis
Chi-Squared: 66.3287
p-value: 0.0

Target True when NaN: 5.59%
Target False when NaN: 94.41%
Target True when non-null: 10.01%
Target False when non-null: 89.99%


In [8]:
chi_sq(df, feature='required_education', target=target)

Reject Null Hypothesis
Chi-Squared: 12.3865
p-value: 0.002

Target True when NaN: 7.86%
Target False when NaN: 92.14%
Target True when non-null: 6.12%
Target False when non-null: 93.88%


In [9]:
chi_sq(df, feature='required_experience', target=target)

Reject Null Hypothesis
Chi-Squared: 16.2374
p-value: 0.0003

Target True when NaN: 8.05%
Target False when NaN: 91.95%
Target True when non-null: 6.04%
Target False when non-null: 93.96%


In [10]:
chi_sq(df, feature='industry', target=target)

Cannot Reject Null Hypothesis
Chi-Squared: 1.8199
p-value: 0.4025

Target True when NaN: 7.42%
Target False when NaN: 92.58%
Target True when non-null: 6.66%
Target False when non-null: 93.34%


In [11]:
chi_sq(df, feature='function', target=target)

Cannot Reject Null Hypothesis
Chi-Squared: 2.914
p-value: 0.2329

Target True when NaN: 6.3%
Target False when NaN: 93.7%
Target True when non-null: 7.17%
Target False when non-null: 92.83%


In [12]:
chi_sq(df, feature='employment_type', target=target)

Reject Null Hypothesis
Chi-Squared: 48.2519
p-value: 0.0

Target True when NaN: 10.67%
Target False when NaN: 89.33%
Target True when non-null: 6.09%
Target False when non-null: 93.91%


In [13]:
chi_sq(df, feature='salary_range', target=target)

Reject Null Hypothesis
Chi-Squared: 65.1333
p-value: 0.0

Target True when NaN: 6.03%
Target False when NaN: 93.97%
Target True when non-null: 11.62%
Target False when non-null: 88.38%


In [14]:
# Drop columns
df.drop(columns=['industry', 'function'], inplace=True)

### Salary range feature
- Ratio of fake to real job postings is much greater in postings that include salary range
- Convert feature to whether or not salary is posted

In [15]:
# Create salary dummy
df['salary_range'] = np.where(df['salary_range'].isna() == True, 0, 1)

### Clean location feature
- Replace with state dummies

In [16]:
# Create 'state' feature, if no state exists then 'no state'
condition = df['location'].str.extract(r'([A-Z]{2}(?<!US))').notnull()
value = df['location'].str.extract(r'([A-Z]{2}(?<!US))')
df['state'] = np.where(condition, value, 'No State')
df['state'] = np.where((df['state'] == 'AU') | (df['state'] == 'LO'), 'No State', df['state'])
df.drop(columns='location', inplace=True)

In [17]:
df = feat_to_dum(df, 'state', s_value='Unspecified', pref=None)

Feature Dummied and Dropped: state


### Department feature
- Convert to dummy, too many different categories with no standardization

In [18]:
# Convert department to dummy
df['department'] = np.where(df['department'].isna() == True, 0, 1)

### Remaining features
- Create 'Unspecified' category for Nan values, dummy, the drop column

In [19]:
df = feat_to_dum(df, 'employment_type', s_value='Unspecified', pref='et')

Feature Dummied and Dropped: employment_type


In [20]:
df = feat_to_dum(df, 'required_experience', s_value='Unspecified', pref='rex')

Feature Dummied and Dropped: required_experience


In [21]:
df = feat_to_dum(df, 'required_education', s_value='Unspecified', pref='red')

Feature Dummied and Dropped: required_education


In [24]:
y = df[target]
X = df.drop(columns=[target, 'description', 'company_profile', 'requirements', 'benefits'])

In [46]:
pca_1 = PCA(n_components=20)
pca_2 = PCA(n_components=40)
pca_3 = PCA(n_components=60)

principalComponents = pca_1.fit_transform(X)
principalComponents = pca_2.fit_transform(X)
principalComponents = pca_3.fit_transform(X)

print(np.sum(pca_1.explained_variance_ratio_))
print(np.sum(pca_2.explained_variance_ratio_))
print(np.sum(pca_3.explained_variance_ratio_))

pca = PCA(n_components=50)
principalComponents = pca.fit_transform(X)
print(np.sum(pca.explained_variance_ratio_))

0.841464702232478
0.9548814611159887
0.9931092649986493
0.979972499631415


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [47]:
pipe_lr = Pipeline([('pca', PCA(n_components=50, random_state=123)),
                    ('clf', LogisticRegression(random_state=123))])

pipe_lr.fit(X_train, y_train)

print(pipe_lr.score(X_test, y_test))

0.9322073657049027


In [49]:
pipe_svm = Pipeline([('pca', PCA(n_components=50)),
                     ('clf', svm.SVC(random_state=123))])
        
pipe_tree = Pipeline([('pca', PCA(n_components=50)),
                      ('clf', tree.DecisionTreeClassifier(random_state=123))])

pipe_rf = Pipeline([('pca', PCA(n_components=50)),
                    ('clf', RandomForestClassifier(random_state=123))])

pipelines = [pipe_svm, pipe_tree, pipe_rf]
pipeline_names = ['Support Vector Machine','Decision Tree','Random Forest']

# Loop to fit each of the three pipelines
for pipe in pipelines:
    print(pipe)
    pipe.fit(X_train, y_train)

# Compare accuracies
for index, val in enumerate(pipelines):
    print('%s pipeline test accuracy: %.3f' % (pipeline_names[index], val.score(X_test, y_test)))

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=50,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=123, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)
Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=50,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth

In [68]:
# pipe_rf = Pipeline([('pca', PCA(n_components=50)),
#                     ('clf', RandomForestClassifier(random_state = 123))])

rf_model = RandomForestClassifier()
# Set grid search params
param_grid_forest = {'n_estimators': [100, 150, 200],
                     'criterion': ['entropy', 'gini'],
                     'max_depth': [4, 5, 6],
                     'min_samples_leaf':[0.05 ,0.1, 0.2],
                     'min_samples_split':[0.05 ,0.1, 0.2]}

# Construct grid search
gs_rf = GridSearchCV(estimator=rf_model,
                     param_grid=param_grid_forest,
                     scoring='f1', cv=5, n_jobs=-1,
                     verbose=1, return_train_score = True)

# Fit using grid search
gs_rf.fit(X_train, y_train)

# Best params
print('\nBest params:\n', gs_rf.best_params_)

gs_rf_model = gs_rf.best_estimator_

preds = gs_rf_model.predict(X_test)
test_f1 = f1_score(y_test, preds)
test_acc = accuracy_score(y_test, preds)

print("Accuracy: %f" % (test_acc))
print("F1: %f" % (test_f1))

Fitting 5 folds for each of 162 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   47.1s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed:  1.4min finished



Best params:
 {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 0.05, 'min_samples_split': 0.05, 'n_estimators': 100}
Accuracy: 0.929627
F1: 0.000000


In [69]:
pipe_svm = Pipeline([('pca', PCA(n_components=50)),
                     ('clf', svm.SVC(random_state=123))])

# Set grid search params
param_grid_svm = [{'clf__C': [0.1, 1, 10]  , 'clf__kernel': ['linear']},
                  {'clf__C': [1, 10], 'clf__gamma': [0.001, 0.01], 'clf__kernel': ['rbf']}]

# Construct grid search
gs_svm = GridSearchCV(estimator=pipe_svm,
                      param_grid=param_grid_svm,
                      scoring='f1', cv=5, n_jobs=-1,
                      verbose=1, return_train_score = True)

# Fit using grid search
gs_svm.fit(X_train, y_train)


# Best params
print('\nBest params:\n', gs_svm.best_params_)

preds = gs_svm.best_estimator_.predict(X_test)

test_f1 = f1_score(y_test, preds)
test_acc = accuracy_score(y_test, preds)

print("Accuracy: %f" % (test_acc))
print("F1: %f" % (test_f1))

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    7.8s finished



Best params:
 {'clf__C': 0.1, 'clf__kernel': 'linear'}
Accuracy: 0.929627
F1: 0.000000


In [63]:
xgb_params = {'n_estimators': [150, 200, 300, 400],
              'learning_rate': [0.1, 0.09],
              'max_depth': [7, 8, 9],
              'colsample_bytree': [0.8],
              'min_child_weight': [1]}

In [64]:
xgb_model = xgb.XGBClassifier()
gs_xgb = GridSearchCV(estimator=xgb_model,
                      param_grid=xgb_params,
                      scoring='f1',
                      n_jobs=-1,
                      verbose=1,
                      cv=5)

gs_xgb.fit(X_train, y_train)
xgb_model = gs_xgb.best_estimator_

preds = xgb_model.predict(X_test)
test_f1 = f1_score(y_test, preds)
test_acc = accuracy_score(y_test, preds)
print('\nBest params:\n', gs_xgb.best_params_)
print("Accuracy: %f" % (test_acc))
print("F1: %f" % (test_f1))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  4.9min finished


Accuracy: 0.941121
F1: 0.450766


In [70]:
preds = xgb_model.predict(X_test)

In [72]:
sum(preds)

157

In [74]:
confusion_matrix(y_test, preds)

array([[3909,   54],
       [ 197,  103]])

In [83]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      3963
           1       0.66      0.34      0.45       300

    accuracy                           0.94      4263
   macro avg       0.80      0.66      0.71      4263
weighted avg       0.93      0.94      0.93      4263



In [79]:
len(y_test)

4263

In [80]:
df_info(df, target)

Total Observations: 10656
Target Variable: fraudulent
Classes: 2
Imbalance: 0 - 9926, 1 - 730
Imbalance Ratio: 0 - 93.15%, 1 - 6.85%

No missing values: department, salary_range, description, telecommuting, has_company_logo, has_questions, fraudulent, AL, AR, AZ, CA, CO, CT, DC, DE, FL, GA, HI, IA, ID, IL, IN, KS, KY, LA, MA, MD, ME, MI, MN, MO, MS, MT, NC, ND, NE, NH, NJ, NM, NV, NY, No State, OH, OK, OR, PA, RI, SC, SD, TN, TX, UT, VA, VT, WA, WI, WV, WY, et_Full-time, et_Other, et_Part-time, et_Temporary, et_Unspecified, rex_Director, rex_Entry level, rex_Executive, rex_Internship, rex_Mid-Senior level, rex_Not Applicable, rex_Unspecified, red_Bachelor's Degree, red_Certification, red_Doctorate, red_High School or equivalent, red_Master's Degree, red_Professional, red_Some College Coursework Completed, red_Some High School Coursework, red_Unspecified, red_Vocational, red_Vocational - Degree, red_Vocational - HS Diploma

Values Missing:
---------------
company_profile: 2076 (19.48%)
